From f7824989f9090b548b4523412e89d0d933ffab08 Mon Sep 17 00:00:00 2001 From: Lillian Skinner <56081713+rvtr@users.noreply.github.com> Date: Fri, 16 May 2025 20:03:35 -0400 Subject: [PATCH] --- SCRAPE_LONDON.SH | 145 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 29 deletions(-) diff --git a/SCRAPE_LONDON.SH b/SCRAPE_LONDON.SH index 8be4895..8a11314 100644 --- a/SCRAPE_LONDON.SH +++ b/SCRAPE_LONDON.SH @@ -3,6 +3,9 @@ echo -e "\n-==================================================================== echo -e "-=- -=-" echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-" echo -e "-=- -=-" +echo -e "-=- https://gist.github.com/rvtr/1b471e5f5215c368fd78d9aba05f8dc2 -=-" +echo -e "-=- Lillian Skinner -=-" +echo -e "-=- -=-" echo -e "-========================================================================-" # Warning to all who read this script: @@ -32,7 +35,7 @@ mkdir $AGENDA_DIR mkdir $VIDEO_DIR SEARCH_URL="https://london.ca/government/council-civic-administration/council-committee-meetings/meetings" -# Need to confirm. When stacking params does the date need to be ?f[1]? +# Need to confirm. When stacking params does the date need to be f[1]? SEARCH_PARAM_COMMITTEE="f[0]=meeting_type%3A" SEARCH_PARAM_DATE="f[0]=meeting_date%3A" SEARCH_PARAM_QUERY="search=query&sort_by=field_meeting_date" @@ -40,7 +43,8 @@ SEARCH_PARAM_QUERY="search=query&sort_by=field_meeting_date" i=0 SEARCH_END="FALSE" while [[ $SEARCH_END == "FALSE" ]]; do - wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$i" -O $SEARCH_PAGE + echo "Downloading search results... Page: $i" + wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$i" -O $SEARCH_PAGE -q #--show-progress if [ $? -ne 8 ]; then FOUNDMEETING="FALSE" @@ -58,43 +62,126 @@ while [[ $SEARCH_END == "FALSE" ]]; do if [[ "TRUE" == $FOUNDMEETING ]]; then FOUNDMEETING="FALSE" + echo "-========================================================================-" + echo " Working on $MEETING_NAME" + echo "-========================================================================-" - AGENDA_URLS=$(echo $LINE | sed 's/ "./tmp/meeting_urls" + # Grab meeting item types + echo $LINE | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' | grep 'rel="noreferrer">' | sed 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' | uniq > "./tmp/meeting_types" - echo "Working on: "$AGENDA_HTML_URL"/n"$AGENDA_PDF_URL - # sleep 1 # London please don't block me! - wget --user-agent="$WGET_UA" $AGENDA_HTML_URL -O $AGENDA_HTML + AGENDA_HTML_URL="" + AGENDA_PDF_URL="" + AGENDA_REVISE_HTML_URL="" + AGENDA_REVISE_PDF_URL="" + MINUTES_HTML_URL="" + MINUTES_PDF_URL="" + MINUTES_ATTACH_PDF_URL="" - # Direct video links is always "video.isilive.ca//" - # There are some eScribe ones, but those are in m3u8s and are really annoying to work with - - # ...not annoying as more sed though. - VIDEO_URL=$(printf "https://video.isilive.ca/london/"; grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g') - - # Finalize everything - mkdir "./tmp/$MEETING_NAME/" - mkdir "./tmp/$MEETING_NAME/Attachments/" - wget --user-agent="$WGET_UA" $AGENDA_PDF_URL -O "./tmp/$MEETING_NAME/Agenda.pdf" - echo $VIDEO_URL >> "./tmp/$MEETING_NAME/RecordingLink.txt" - # Get attachment links - cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls" - # Get attachment names - cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_names" - # Download attachment and use the name grabbed above + echo "Found the following documents:" while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do - wget --user-agent="$WGET_UA" https://pub-london.escribemeetings.com/$LINEA1 -O "./tmp/$MEETING_NAME/Attachments/$LINEA2" - echo "Here are the datas: https://pub-london.escribemeetings.com/$LINEA1/n$LINEA2" - done < ./tmp/attachment_urls 3< ./tmp/attachment_names + echo " - $LINEA2" + + case "$LINEA2" in + "Agenda (HTML) ") + AGENDA_HTML_URL="$LINEA1" ;; + "Agenda (PDF) ") + AGENDA_PDF_URL="$LINEA1" ;; + "Revised Agenda (HTML) ") + AGENDA_REVISE_HTML_URL="$LINEA1" ;; + "Revised Agenda (PDF) ") + AGENDA_REVISE_PDF_URL="$LINEA1" ;; + "Minutes (HTML) ") + MINUTES_HTML_URL="$LINEA1" ;; + "Minutes (PDF) ") + MINUTES_PDF_URL="$LINEA1" ;; + "Minutes with Attachments (PDF) ") + MINUTES_ATTACH_PDF_URL="$LINEA1" ;; + esac + + done < ./tmp/meeting_urls 3< ./tmp/meeting_types + + # Always prefer Revised Agendas + echo "Downloading agenda HTML..." + if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then + wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress + elif [[ $AGENDA_HTML_URL != "" ]]; then + wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress + else + ERROR="TRUE" + fi + + if [[ ERROR="FALSE" ]]; then + mkdir "./tmp/$MEETING_NAME/" + mkdir "./tmp/$MEETING_NAME/Attachments/" + + # Direct video links is always "video.isilive.ca//" + # There are some eScribe ones, but those are in m3u8s and are really annoying to work with + + # ...not annoying as more sed though. + VIDEO_URL=$(grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g') + + if [[ $VIDEO_URL != "" ]]; then + echo "Found meeting recording." + echo "https://video.isilive.ca/london/"$VIDEO_URL > "./tmp/$MEETING_NAME/RecordingLink.txt" + fi + + # Get attachment links + cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls" + # Get attachment names + cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_names" + # Download attachment and use the name grabbed above + echo "Found the following agenda attachments:" + while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do + echo " - $LINEA2" + wget --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "./tmp/$MEETING_NAME/Attachments/$LINEA2" -q #--show-progress + done < ./tmp/attachment_urls 3< ./tmp/attachment_names + echo "All attachments saved." + + if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then + if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then + echo "Saving revised agenda..." + wget --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "./tmp/$MEETING_NAME/Agenda_Revised.pdf" -q #--show-progress + fi + if [[ $AGENDA_PDF_URL != "" ]]; then + echo "Saving regular agenda..." + wget --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "./tmp/$MEETING_NAME/Agenda.pdf" -q #--show-progress + fi + else + if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then + echo "Saving revised agenda as HTML (no PDF found!)" + wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "./tmp/$MEETING_NAME/Agenda_Revised.html" -q #--show-progress + fi + if [[ $AGENDA_HTML_URL != "" ]]; then + echo "Saving regular agenda as HTML (no PDF found!)" + wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "./tmp/$MEETING_NAME/Agenda.html" -q #--show-progress + fi + fi + + if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then + if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then + echo "Saving minutes with attachments..." + wget --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "./tmp/$MEETING_NAME/Minutes_With_Attachments.pdf" -q #--show-progress + fi + if [[ $MINUTES_PDF_URL != "" ]]; then + echo "Saving minutes..." + wget --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "./tmp/$MEETING_NAME/Minutes.pdf" -q #--show-progress + fi + else + if [[ $MINUTES_HTML_URL != "" ]]; then + echo "Saving minutes as HTML (no PDF found!)" + wget --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "./tmp/$MEETING_NAME/Minutes.html" -q #--show-progress + fi + fi + + fi fi GREPMEETING=$(echo $LINE | grep "views-field-field-meeting-notes") if [[ "$GREPMEETING" != "" ]]; then FOUNDMEETING="TRUE" MEETING_NAME=$(echo $LINE | sed -n 's/.*
\([^<]*\)<\/div>.*/\1/p') - echo $MEETING_NAME fi done < $SEARCH_PAGE else