2026-01-26 13:42:45 -05:00 · 2025-05-16 04:44:00 -04:00 · 2025-05-16 04:44:00 -04:00 · e6058497d3
commit e6058497d3
1 changed files with 107 additions and 0 deletions
--- a/SCRAPE_LONDON.SH
+++ b/SCRAPE_LONDON.SH
@ -0,0 +1,107 @@
+#/bash
+echo -e "\n-========================================================================-"
+echo -e "-=-                                                                    -=-"
+echo -e "-=-      SCRAPE_LONDON.SH: Downloads committee videos and agendas      -=-"
+echo -e "-=-                                                                    -=-"
+echo -e "-========================================================================-"
+
+# Warning to all who read this script:
+# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
+
+# London seems to have recently blocked unusual user agents, so default wget (or even ping) can't be used. Thankfully, we can pretend to be a real person!
+# Browser-like User-Agent string passed to every wget call via --user-agent.
+WGET_UA='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87'
+
+# Scratch directory, plus the two files inside it that hold the most recently
+# downloaded search-results page and agenda page.
+TEMP_DIR='./tmp/'
+SEARCH_PAGE='./tmp/index.html'
+AGENDA_HTML='./tmp/work.html'
+
+# Directories that are wiped and recreated at startup together with TEMP_DIR.
+AGENDA_DIR='./Agenda/'
+VIDEO_DIR='./Video/'
+#VIDEO_TIMESTAMP_JSON="./tmp/time.json"
+
+if [ -d "$TEMP_DIR" ]; then
+  rm -r $TEMP_DIR
+fi
+if [ -d "$AGENDA_DIR" ]; then
+  rm -r $AGENDA_DIR
+fi
+if [ -d "$VIDEO_DIR" ]; then
+  rm -r $VIDEO_DIR
+fi
+mkdir $TEMP_DIR
+mkdir $AGENDA_DIR
+mkdir $VIDEO_DIR
+
+# Base address of the meeting search; the crawl appends "?page=<n>" to it.
+SEARCH_URL='https://london.ca/government/council-civic-administration/council-committee-meetings/meetings'
+
+# Query-string fragments for filtering the search by committee or date.
+# Still to confirm: when several filters are stacked, does the date filter
+# have to use f[1] instead of f[0]?
+SEARCH_PARAM_COMMITTEE='f[0]=meeting_type%3A'
+SEARCH_PARAM_DATE='f[0]=meeting_date%3A'
+SEARCH_PARAM_QUERY='search=query&sort_by=field_meeting_date'
+
+i=0
+SEARCH_END="FALSE"
+while [[ $SEARCH_END == "FALSE" ]]; do
+        wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$i" -O $SEARCH_PAGE
+        if [ $? -ne 8 ]; then
+                FOUNDMEETING="FALSE"
+
+                GREP404=$(cat $SEARCH_PAGE | grep "No results found.")
+                if [[ "$GREP404" == "" ]]; then
+                        while IFS= read -r LINE; do
+
+                                # All meeting items in the search results are formatted like so:
+                                # - One line with the name
+                                # - Second line with all other info including links
+                                #
+                                # We can find the first line by the class "views-field-field-meeting-notes"
+                                # FOUNDMEETING=TRUE will show that the first line has been found, and so the next line read will be "confirmed" as line 2 of the meeting info
+                                # The first two links of every second line are (in order) the PDF and HTML agendas
+
+                                if [[ "TRUE" == $FOUNDMEETING ]]; then
+                                        FOUNDMEETING="FALSE"
+
+                                        AGENDA_URLS=$(echo $LINE | sed 's/<a href="/\n<a href="/g' | grep '<a href="' | sed -n 's/.*<a href="\([^"]*\)".*/\1/p')
+                                        AGENDA_PDF_URL=$(echo "$AGENDA_URLS" | sed -n '1p')
+                                        AGENDA_HTML_URL=$(echo "$AGENDA_URLS" | sed -n '2p')
+                                        echo $AGENDA_URLS
+
+                                        echo "Working on: "$AGENDA_HTML_URL"/n"$AGENDA_PDF_URL
+                                        # sleep 1 # London please don't block me!
+                                        wget --user-agent="$WGET_UA" $AGENDA_HTML_URL -O $AGENDA_HTML
+
+                                        # Direct video links is always "video.isilive.ca/<REGION>/<NAME>"
+                                        # There are some eScribe ones, but those are in m3u8s and are really annoying to work with
+
+                                        # ...not annoying as more sed though.
+                                        VIDEO_URL=$(printf "https://video.isilive.ca/london/"; grep 'id="isi_player"' ./tmp/work.html | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g')
+
+                                        # Finalize everything
+                                        mkdir "./tmp/$MEETING_NAME/"
+                                        mkdir "./tmp/$MEETING_NAME/Attachments/"
+                                        wget --user-agent="$WGET_UA" $AGENDA_PDF_URL -O "./tmp/$MEETING_NAME/Agenda.pdf"
+                                        echo $VIDEO_URL >> "./tmp/$MEETING_NAME/RecordingLink.txt"
+                                        # Get attachment links
+                                        cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
+                                        # Get attachment names
+                                        cat $AGENDA_HTML | grep "AgendaItemAgendaItem1TitleHeader" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_names"
+                                        # Download attachment and use the name grabbed above
+                                        while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
+                                          wget --user-agent="$WGET_UA" https://pub-london.escribemeetings.com/$LINEA1 -O "./tmp/$MEETING_NAME/Attachments/$LINEA2"
+                                          echo "Here are the datas: https://pub-london.escribemeetings.com/$LINEA1/n$LINEA2"
+                                        done < ./tmp/attachment_urls 3< ./tmp/attachment_names
+                                fi
+
+                                GREPMEETING=$(echo $LINE | grep "views-field-field-meeting-notes")
+                                if [[ "$GREPMEETING" != "" ]]; then
+                                        FOUNDMEETING="TRUE"
+                                        MEETING_NAME=$(echo $LINE | sed -n 's/.*<div class="meeting__date">\([^<]*\)<\/div>.*/\1/p')
+                                        echo $MEETING_NAME
+                                fi
+                        done < $SEARCH_PAGE
+                else
+                        SEARCH_END="TRUE"
+                fi
+        else
+            SEARCH_END="TRUE"
+        fi
+        ((i++))
+done