From f7824989f9090b548b4523412e89d0d933ffab08 Mon Sep 17 00:00:00 2001
From: Lillian Skinner <56081713+rvtr@users.noreply.github.com>
Date: Fri, 16 May 2025 20:03:35 -0400
Subject: [PATCH]

---
 SCRAPE_LONDON.SH | 145 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 116 insertions(+), 29 deletions(-)

diff --git a/SCRAPE_LONDON.SH b/SCRAPE_LONDON.SH
index 8be4895..8a11314 100644
--- a/SCRAPE_LONDON.SH
+++ b/SCRAPE_LONDON.SH
@@ -3,6 +3,9 @@ echo -e "\n-====================================================================
 echo -e "-=-                                                                    -=-"
 echo -e "-=-      SCRAPE_LONDON.SH: Downloads committee videos and agendas      -=-"
 echo -e "-=-                                                                    -=-"
+echo -e "-=-    https://gist.github.com/rvtr/1b471e5f5215c368fd78d9aba05f8dc2   -=-"
+echo -e "-=-     Lillian Skinner                                                -=-"
+echo -e "-=-                                                                    -=-"
 echo -e "-========================================================================-"
 
 # Warning to all who read this script:
@@ -32,7 +35,7 @@ mkdir $AGENDA_DIR
 mkdir $VIDEO_DIR
 
 SEARCH_URL="https://london.ca/government/council-civic-administration/council-committee-meetings/meetings"
-# Need to confirm. When stacking params does the date need to be ?f[1]?
+# Need to confirm. When stacking params does the date need to be f[1]?
 SEARCH_PARAM_COMMITTEE="f[0]=meeting_type%3A"
 SEARCH_PARAM_DATE="f[0]=meeting_date%3A"
 SEARCH_PARAM_QUERY="search=query&sort_by=field_meeting_date"
@@ -40,7 +43,8 @@ SEARCH_PARAM_QUERY="search=query&sort_by=field_meeting_date"
 i=0
 SEARCH_END="FALSE"
 while [[ $SEARCH_END == "FALSE" ]]; do
-        wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$i" -O $SEARCH_PAGE
+    echo "Downloading search results... Page: $i"
+        wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$i" -O $SEARCH_PAGE -q #--show-progress
         if [ $? -ne 8 ]; then
                 FOUNDMEETING="FALSE"
 
@@ -58,43 +62,126 @@ while [[ $SEARCH_END == "FALSE" ]]; do
 
                                 if [[ "TRUE" == $FOUNDMEETING ]]; then
                                         FOUNDMEETING="FALSE"
+                                        echo "-========================================================================-"
+                                        echo " Working on $MEETING_NAME"
+                                        echo "-========================================================================-"
 
-                                        AGENDA_URLS=$(echo $LINE | sed 's/<a href="/\n<a href="/g' | grep '<a href="' | sed -n 's/.*<a href="\([^"]*\)".*/\1/p')
-                                        AGENDA_PDF_URL=$(echo "$AGENDA_URLS" | sed -n '1p')
-                                        AGENDA_HTML_URL=$(echo "$AGENDA_URLS" | sed -n '2p')
-                                        echo $AGENDA_URLS
+                                        # Grab meeting item links
+                                        echo $LINE | sed 's/href=./\nhref="/g' | grep 'href="https' | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq > "./tmp/meeting_urls"
+                                        # Grab meeting item types
+                                        echo $LINE | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' | grep 'rel="noreferrer">' | sed 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' | uniq > "./tmp/meeting_types"
 
-                                        echo "Working on: "$AGENDA_HTML_URL"/n"$AGENDA_PDF_URL
-                                        # sleep 1 # London please don't block me!
-                                        wget --user-agent="$WGET_UA" $AGENDA_HTML_URL -O $AGENDA_HTML
+                                        AGENDA_HTML_URL=""
+                                        AGENDA_PDF_URL=""
+                                        AGENDA_REVISE_HTML_URL=""
+                                        AGENDA_REVISE_PDF_URL=""
+                                        MINUTES_HTML_URL=""
+                                        MINUTES_PDF_URL=""
+                                        MINUTES_ATTACH_PDF_URL=""
 
-                                        # Direct video links is always "video.isilive.ca/<REGION>/<NAME>"
-                                        # There are some eScribe ones, but those are in m3u8s and are really annoying to work with
-
-                                        # ...not annoying as more sed though.
-                                        VIDEO_URL=$(printf "https://video.isilive.ca/london/"; grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g')
-
-                                        # Finalize everything
-                                        mkdir "./tmp/$MEETING_NAME/"
-                                        mkdir "./tmp/$MEETING_NAME/Attachments/"
-                                        wget --user-agent="$WGET_UA" $AGENDA_PDF_URL -O "./tmp/$MEETING_NAME/Agenda.pdf"
-                                        echo $VIDEO_URL >> "./tmp/$MEETING_NAME/RecordingLink.txt"
-                                        # Get attachment links
-                                        cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
-                                        # Get attachment names
-                                        cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_names"
-                                        # Download attachment and use the name grabbed above
+                                        echo "Found the following documents:"
                                         while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
-                                          wget --user-agent="$WGET_UA" https://pub-london.escribemeetings.com/$LINEA1 -O "./tmp/$MEETING_NAME/Attachments/$LINEA2"
-                                          echo "Here are the datas: https://pub-london.escribemeetings.com/$LINEA1/n$LINEA2"
-                                        done < ./tmp/attachment_urls 3< ./tmp/attachment_names
+                                            echo " - $LINEA2"
+
+                                            case "$LINEA2" in
+                                                "Agenda (HTML) ")
+                                                    AGENDA_HTML_URL="$LINEA1" ;;
+                                                "Agenda (PDF) ")
+                                                    AGENDA_PDF_URL="$LINEA1" ;;
+                                                "Revised Agenda (HTML) ")
+                                                    AGENDA_REVISE_HTML_URL="$LINEA1" ;;
+                                                "Revised Agenda (PDF) ")
+                                                    AGENDA_REVISE_PDF_URL="$LINEA1" ;;
+                                                "Minutes (HTML) ")
+                                                    MINUTES_HTML_URL="$LINEA1" ;;
+                                                "Minutes (PDF) ")
+                                                    MINUTES_PDF_URL="$LINEA1" ;;
+                                                "Minutes with Attachments (PDF) ")
+                                                    MINUTES_ATTACH_PDF_URL="$LINEA1" ;;
+                                            esac
+
+                                        done < ./tmp/meeting_urls 3< ./tmp/meeting_types
+
+                                        # Always prefer Revised Agendas. ERROR is reset for every meeting so a missing agenda only skips that one meeting.
+                                        ERROR="FALSE"
+                                        echo "Downloading agenda HTML..."
+                                        if [[ -n "$AGENDA_REVISE_HTML_URL" ]]; then
+                                                wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$AGENDA_HTML" -q #--show-progress
+                                        elif [[ -n "$AGENDA_HTML_URL" ]]; then
+                                                wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$AGENDA_HTML" -q #--show-progress
+                                        else
+                                                echo "No agenda HTML link found, skipping $MEETING_NAME" >&2 # otherwise the previous meeting's HTML would be scraped
+                                                ERROR="TRUE"
+                                        fi
+                                        if [[ "$ERROR" == "FALSE" ]]; then # compare the variable; [[ ERROR="FALSE" ]] is a non-empty word and always true
+                                                mkdir -p "./tmp/$MEETING_NAME/Attachments/" # -p also creates the meeting directory and tolerates reruns
+
+                                                # Direct video links are always "video.isilive.ca/<REGION>/<NAME>"
+                                                # There are some eScribe ones, but those are in m3u8s and are really annoying to work with
+
+                                                # ...not as annoying as more sed, though.
+                                                VIDEO_URL=$(grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g')
+
+                        if [[ $VIDEO_URL != "" ]]; then
+                            echo "Found meeting recording."
+                            echo "https://video.isilive.ca/london/"$VIDEO_URL > "./tmp/$MEETING_NAME/RecordingLink.txt"
+                        fi
+
+                                                # Get attachment links
+                                                cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
+                                                # Get attachment names
+                                                cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_names"
+                                                # Download attachment and use the name grabbed above
+                                                echo "Found the following agenda attachments:"
+                                                while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
+                                                  echo " - $LINEA2"
+                                                  wget --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "./tmp/$MEETING_NAME/Attachments/$LINEA2" -q #--show-progress
+                                                done < ./tmp/attachment_urls 3< ./tmp/attachment_names
+                                                echo "All attachments saved."
+
+                                                if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then
+                                    if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then
+                                        echo "Saving revised agenda..."
+                                        wget --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "./tmp/$MEETING_NAME/Agenda_Revised.pdf" -q #--show-progress
+                                    fi
+                                    if [[ $AGENDA_PDF_URL != "" ]]; then
+                                        echo "Saving regular agenda..."
+                                        wget --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "./tmp/$MEETING_NAME/Agenda.pdf" -q #--show-progress
+                                    fi
+                                else
+                                    if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
+                                        echo "Saving revised agenda as HTML (no PDF found!)"
+                                        wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "./tmp/$MEETING_NAME/Agenda_Revised.html" -q #--show-progress
+                                    fi
+                                    if [[ $AGENDA_HTML_URL != "" ]]; then
+                                        echo "Saving regular agenda as HTML (no PDF found!)"
+                                        wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "./tmp/$MEETING_NAME/Agenda.html" -q #--show-progress
+                                    fi
+                                fi
+                                
+                                                if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then
+                                    if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then
+                                        echo "Saving minutes with attachments..."
+                                        wget --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "./tmp/$MEETING_NAME/Minutes_With_Attachments.pdf" -q #--show-progress
+                                    fi
+                                    if [[ $MINUTES_PDF_URL != "" ]]; then
+                                        echo "Saving minutes..."
+                                        wget --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "./tmp/$MEETING_NAME/Minutes.pdf" -q #--show-progress
+                                    fi
+                                else
+                                    if [[ $MINUTES_HTML_URL != "" ]]; then
+                                        echo "Saving minutes as HTML (no PDF found!)"
+                                        wget --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "./tmp/$MEETING_NAME/Minutes.html" -q #--show-progress
+                                    fi
+                                fi
+
+                                        fi
                                 fi
 
                                 GREPMEETING=$(echo $LINE | grep "views-field-field-meeting-notes")
                                 if [[ "$GREPMEETING" != "" ]]; then
                                         FOUNDMEETING="TRUE"
                                         MEETING_NAME=$(echo $LINE | sed -n 's/.*<div class="meeting__date">\([^<]*\)<\/div>.*/\1/p')
-                                        echo $MEETING_NAME
                                 fi
                         done < $SEARCH_PAGE
                 else