From e8dbcfbe099bc668c9d3424e1ff6c7638ea62416 Mon Sep 17 00:00:00 2001 From: Lillian Skinner <56081713+rvtr@users.noreply.github.com> Date: Tue, 17 Jun 2025 17:27:24 -0400 Subject: [PATCH] --- SCRAPE_MEETINGS.SH | 329 ++++++++++++++++++++++----------------------- 1 file changed, 159 insertions(+), 170 deletions(-) diff --git a/SCRAPE_MEETINGS.SH b/SCRAPE_MEETINGS.SH index b8cc1b4..2d05f74 100644 --- a/SCRAPE_MEETINGS.SH +++ b/SCRAPE_MEETINGS.SH @@ -1,4 +1,4 @@ -#/bash +#!/bin/bash # Hey folks, please do not run this script more than necessary. # Too many search requests will temporarily block searches for everyone, not just you. # I do not want to DDOS London. I just want to allow for personal backups. Cheers! @@ -39,176 +39,173 @@ SEARCH_FORMAT_DATE="f[0]=meeting_date%3A" SEARCH_FORMAT_QUERY="search=query&sort_by=field_meeting_date" # As far as I'm aware there are no meetings prior to 2011. -current_year=$(date +%Y) -current_month=$(date +%m) -current_day=$(date +%d) -i=$(date +%Y) +i=2011 x=$((i + 1)) echo $x SEARCH_END="FALSE" while (( i < x )); do - j=0 - SEARCH_END="FALSE" - while [[ $SEARCH_END == "FALSE" ]]; do - echo "SCRAPE_MEETINGS: Downloading search results... Page $j of $i" - wget --user-agent="$WGET_UA" $SEARCH_URL"?$SEARCH_FORMAT_DATE$i&page=$j" -O $SEARCH_PAGE -q #--show-progress - if [ $? -ne 8 ]; then - FOUNDMEETING="FALSE" + j=0 + SEARCH_END="FALSE" + while [[ $SEARCH_END == "FALSE" ]]; do + echo "SCRAPE_MEETINGS: Downloading search results... Page $j of $i" + wget --user-agent="$WGET_UA" $SEARCH_URL"?$SEARCH_FORMAT_DATE$i&page=$j" -O $SEARCH_PAGE -q #--show-progress + if [ $? -ne 8 ]; then + FOUNDMEETING="FALSE" - GREP404=$(cat $SEARCH_PAGE | grep "No results found.") - if [[ "$GREP404" == "" ]]; then - while IFS= read -r LINE; do + GREP404=$(cat $SEARCH_PAGE | grep "No results found.") + if [[ "$GREP404" == "" ]]; then + while IFS= read -r LINE; do - # All meeting items in the search results are formatted like so: - # - One line with the name - # - Second line with all other info including links - # - # We can find the first line by the class "views-field-field-meeting-notes" - # FOUNDMEETING=TRUE will show that the first line has been found, and so the next line read will be "confirmed" as line 2 of the meeting info - # The first two links of every second line are (in order) the PDF and HTML agendas + # All meeting items in the search results are formatted like so: + # - One line with the name + # - Second line with all other info including links + # + # We can find the first line by the class "views-field-field-meeting-notes" + # FOUNDMEETING=TRUE will show that the first line has been found, and so the next line read will be "confirmed" as line 2 of the meeting info + # The first two links of every second line are (in order) the PDF and HTML agendas - if [[ "TRUE" == $FOUNDMEETING ]]; then - FOUNDMEETING="FALSE" - echo "SCRAPE_MEETINGS: -========================================================================-" - echo "SCRAPE_MEETINGS: Working on $MEETING_NAME ($MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY)" - echo "SCRAPE_MEETINGS: All files to be saved as "$MEETING_NAME"/"$MEETING_YEAR"/"$MEETING_MONTH"-"$MEETING_DAY"/" - echo "SCRAPE_MEETINGS: -========================================================================-" - echo "SCRAPE_MEETINGS: Task starting on: $(date)" + if [[ "TRUE" == $FOUNDMEETING ]]; then + FOUNDMEETING="FALSE" + echo "SCRAPE_MEETINGS: -========================================================================-" + echo "SCRAPE_MEETINGS: Working on $MEETING_NAME ($MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY)" + echo "SCRAPE_MEETINGS: All files to be saved as "$MEETING_NAME"/"$MEETING_YEAR"/"$MEETING_MONTH"-"$MEETING_DAY"/" + echo "SCRAPE_MEETINGS: -========================================================================-" + echo "SCRAPE_MEETINGS: Task starting on: $(date)" - #echo "CANCEL NOW!!!" - #sleep 5 + #echo "CANCEL NOW!!!" + #sleep 5 - # Grab meeting item links - echo $LINE | sed 's/href=./\nhref="/g' | grep 'href="https' | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq > "./tmp/meeting_urls" - # Grab meeting item types - echo $LINE | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' | grep 'rel="noreferrer">' | sed 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' | uniq > "./tmp/meeting_types" + # Grab meeting item links + echo $LINE | sed 's/href=./\nhref="/g' | grep 'href="https' | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq > "./tmp/meeting_urls" + # Grab meeting item types + echo $LINE | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' | grep 'rel="noreferrer">' | sed 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' | uniq > "./tmp/meeting_types" - AGENDA_HTML_URL="" - AGENDA_PDF_URL="" - AGENDA_REVISE_HTML_URL="" - AGENDA_REVISE_PDF_URL="" - MINUTES_HTML_URL="" - MINUTES_PDF_URL="" - MINUTES_ATTACH_PDF_URL="" + AGENDA_HTML_URL="" + AGENDA_PDF_URL="" + AGENDA_REVISE_HTML_URL="" + AGENDA_REVISE_PDF_URL="" + MINUTES_HTML_URL="" + MINUTES_PDF_URL="" + MINUTES_ATTACH_PDF_URL="" - echo "SCRAPE_MEETINGS: Found the following documents:" - while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do - echo "SCRAPE_MEETINGS: - $LINEA2" + echo "SCRAPE_MEETINGS: Found the following documents:" + while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do + echo "SCRAPE_MEETINGS: - $LINEA2" - case "$LINEA2" in - "Agenda (HTML) ") - AGENDA_HTML_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - "Agenda (PDF) ") - AGENDA_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - "Revised Agenda (HTML) ") - AGENDA_REVISE_HTML_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - "Revised Agenda (PDF) ") - AGENDA_REVISE_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - "Minutes (HTML) ") - MINUTES_HTML_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - "Minutes (PDF) ") - MINUTES_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - "Minutes with Attachments (PDF) ") - MINUTES_ATTACH_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; - esac + case "$LINEA2" in + "Agenda (HTML) ") + AGENDA_HTML_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + "Agenda (PDF) ") + AGENDA_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + "Revised Agenda (HTML) ") + AGENDA_REVISE_HTML_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + "Revised Agenda (PDF) ") + AGENDA_REVISE_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + "Minutes (HTML) ") + MINUTES_HTML_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + "Minutes (PDF) ") + MINUTES_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + "Minutes with Attachments (PDF) ") + MINUTES_ATTACH_PDF_URL=$(echo $LINEA1 | sed 's/&/\&/g' | sed 's/'/'\''/g') ;; + esac - done < ./tmp/meeting_urls 3< ./tmp/meeting_types + done < ./tmp/meeting_urls 3< ./tmp/meeting_types - # Always prefer Revised Agendas - echo "SCRAPE_MEETINGS: Downloading agenda HTML..." - if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then - wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress - elif [[ $AGENDA_HTML_URL != "" ]]; then - wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress - else - ERROR="TRUE" - fi + # Always prefer Revised Agendas + echo "SCRAPE_MEETINGS: Downloading agenda HTML..." + if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then + wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress + elif [[ $AGENDA_HTML_URL != "" ]]; then + wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress + else + ERROR="TRUE" + fi - if [[ ERROR="FALSE" ]]; then + if [[ ERROR="FALSE" ]]; then mkdir "./LondonArchive" mkdir "./LondonArchive/Meetings" - if [ ! -d "./LondonArchive/Meetings/$MEETING_NAME" ]; then - mkdir "./LondonArchive/Meetings/$MEETING_NAME/" - fi - if [ ! -d "./LondonArchive/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then - mkdir "./LondonArchive/Meetings/$MEETING_NAME/$MEETING_YEAR/" - fi - MEETING_DIR=$(printf "./LondonArchive/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY") - if [ ! -d "$MEETING_DIR" ]; then - mkdir "$MEETING_DIR/" - fi - if [ ! -d "$MEETING_DIR/Attachments" ]; then - mkdir "$MEETING_DIR/Attachments/" - fi - - # Direct video links is always "video.isilive.ca//" - # There are some eScribe ones, but those are in m3u8s and are really annoying to work with + if [ ! -d "./LondonArchive/Meetings/$MEETING_NAME" ]; then + mkdir "./LondonArchive/Meetings/$MEETING_NAME/" + fi + if [ ! -d "./LondonArchive/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then + mkdir "./LondonArchive/Meetings/$MEETING_NAME/$MEETING_YEAR/" + fi + MEETING_DIR=$(printf "./LondonArchive/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY") + if [ ! -d "$MEETING_DIR" ]; then + mkdir "$MEETING_DIR/" + fi + if [ ! -d "$MEETING_DIR/Attachments" ]; then + mkdir "$MEETING_DIR/Attachments/" + fi + + # Direct video links is always "video.isilive.ca//" + # There are some eScribe ones, but those are in m3u8s and are really annoying to work with - # ...not annoying as more sed though. - VIDEO_URL=$(grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g') - - if [[ $VIDEO_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving recording URL..." - echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt" - fi - - # Get attachment links - cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls" - # Get attachment names - cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names" - # Download attachment and use the name grabbed above - echo "SCRAPE_MEETINGS: Found the following agenda attachments:" - while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do - echo "SCRAPE_MEETINGS: - $LINEA2" - wget --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -q #--show-progress - done < ./tmp/attachment_urls 3< ./tmp/attachment_names - echo "SCRAPE_MEETINGS: All attachments saved." + # ...not annoying as more sed though. + VIDEO_URL=$(grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g') + + if [[ $VIDEO_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving recording URL..." + echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt" + fi + + # Get attachment links + cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls" + # Get attachment names + cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names" + # Download attachment and use the name grabbed above + echo "SCRAPE_MEETINGS: Found the following agenda attachments:" + while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do + echo "SCRAPE_MEETINGS: - $LINEA2" + wget --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -q #--show-progress + done < ./tmp/attachment_urls 3< ./tmp/attachment_names + echo "SCRAPE_MEETINGS: All attachments saved." - if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then - if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving revised agenda as PDF..." - wget --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$MEETING_DIR/Agenda_Revised.pdf" -q #--show-progress - fi - if [[ $AGENDA_PDF_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving regular agenda as PDF..." - wget --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$MEETING_DIR/Agenda.pdf" -q #--show-progress - fi - else - if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving revised agenda as HTML... (no PDF found!)" - wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$MEETING_DIR/Agenda_Revised.html" -q #--show-progress - fi - if [[ $AGENDA_HTML_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving regular agenda as HTML... (no PDF found!)" - wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$MEETING_DIR/Agenda.html" -q #--show-progress - fi - fi - - if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then - if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving minutes with attachments as PDF..." - wget --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$MEETING_DIR/Minutes_With_Attachments.pdf" -q #--show-progress - fi - if [[ $MINUTES_PDF_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving minutes as PDF..." - wget --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$MEETING_DIR/Minutes.pdf" -q #--show-progress - fi - else - if [[ $MINUTES_HTML_URL != "" ]]; then - echo "SCRAPE_MEETINGS: Saving minutes as HTML... (no PDF found!)" - wget --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$MEETING_DIR/Minutes.html" -q #--show-progress - fi - fi + if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then + if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving revised agenda as PDF..." + wget --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$MEETING_DIR/Agenda_Revised.pdf" -q #--show-progress + fi + if [[ $AGENDA_PDF_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving regular agenda as PDF..." + wget --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$MEETING_DIR/Agenda.pdf" -q #--show-progress + fi + else + if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving revised agenda as HTML... (no PDF found!)" + wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$MEETING_DIR/Agenda_Revised.html" -q #--show-progress + fi + if [[ $AGENDA_HTML_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving regular agenda as HTML... (no PDF found!)" + wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$MEETING_DIR/Agenda.html" -q #--show-progress + fi + fi + + if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then + if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving minutes with attachments as PDF..." + wget --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$MEETING_DIR/Minutes_With_Attachments.pdf" -q #--show-progress + fi + if [[ $MINUTES_PDF_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving minutes as PDF..." + wget --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$MEETING_DIR/Minutes.pdf" -q #--show-progress + fi + else + if [[ $MINUTES_HTML_URL != "" ]]; then + echo "SCRAPE_MEETINGS: Saving minutes as HTML... (no PDF found!)" + wget --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$MEETING_DIR/Minutes.html" -q #--show-progress + fi + fi - fi - echo "SCRAPE_MEETINGS: All files from this meeting have been saved." - fi - - GREPMEETING=$(echo $LINE | grep "views-field-field-meeting-notes") - if [[ "$GREPMEETING" != "" ]]; then - MEETING_INFO=$(echo $LINE | sed -n 's/.*
\([^<]*\)<\/div>.*/\1/p') + fi + echo "SCRAPE_MEETINGS: All files from this meeting have been saved." + fi + + GREPMEETING=$(echo $LINE | grep "views-field-field-meeting-notes") + if [[ "$GREPMEETING" != "" ]]; then + MEETING_INFO=$(echo $LINE | sed -n 's/.*
\([^<]*\)<\/div>.*/\1/p') MEETING_MONTH_WORD=$(echo "$MEETING_INFO" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//') MEETING_DAY_SHORT=$(echo "$MEETING_INFO" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//') @@ -231,27 +228,19 @@ while (( i < x )); do *) MEETING_MONTH="--" ;; esac - if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then - #echo "date is greater" - FOUNDMEETING="TRUE" - else - echo "SCRAPE_MEETINGS: Dates are in the past! Abort." - SEARCH_END="TRUE" - break - fi - - fi - done < $SEARCH_PAGE - else - SEARCH_END="TRUE" - echo "SCRAPE_MEETINGS: No more pages!" - fi - else - SEARCH_END="TRUE" - echo "SCRAPE_MEETINGS: No more pages!" - fi - ((j++)) - done + FOUNDMEETING="TRUE" + fi + done < $SEARCH_PAGE + else + SEARCH_END="TRUE" + echo "SCRAPE_MEETINGS: No more pages!" + fi + else + SEARCH_END="TRUE" + echo "SCRAPE_MEETINGS: No more pages!" + fi + ((j++)) + done ((i++)) done echo "Done job: SCRAPE_MEETINGS: $(date)"