#!/usr/bin/env bash
#
# SCRAPE_MEETINGS.SH: downloads committee meeting agendas, minutes, agenda
# attachments and recording links from london.ca into ./LondonArchive/Meetings.
#
# Hey folks, please do not run this script more than necessary.
# Too many search requests will temporarily block searches for everyone, not
# just you. I do not want to DDoS London. I just want to allow for personal
# backups. Cheers!
#
# Usage:    bash scrape_meetings.sh        (run from the directory that should
#                                           hold ./LondonArchive and ./tmp)
# Requires: bash, wget, grep, awk, tr and a sed that understands "\n" in the
#           replacement part of s/// (GNU sed).

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MEETINGS.SH: Downloads committee videos and agendas -=-"
echo -e "-=- -=-"
echo -e "-=- https://gist.github.com/rvtr/1b471e5f5215c368fd78d9aba05f8dc2 -=-"
echo -e "-=- Lillian Skinner (2025) -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
echo "Starting job: SCRAPE_MEETINGS: $(date)"

# Warning to all who read this script: it scrapes HTML with sed and grep, so it
# is tightly coupled to the current markup of the site and will need attention
# whenever that markup changes.

# London seems to have recently blocked unusual user agents, so the default
# wget identification is rejected. Thankfully we can pretend to be a browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index.html"
AGENDA_HTML="./tmp/work.html"
#VIDEO_TIMESTAMP_JSON="./tmp/time.json"
ARCHIVE_ROOT="./LondonArchive/Meetings"

# Start every run with an empty scratch directory.
if [ -d "$TEMP_DIR" ]; then
  rm -r -- "${TEMP_DIR:?}"
fi
rm -f -- "$SEARCH_PAGE" "$AGENDA_HTML"
if ! mkdir -p -- "$TEMP_DIR"; then
  echo "SCRAPE_MEETINGS: Cannot create $TEMP_DIR" >&2
  exit 1
fi

SEARCH_URL="https://london.ca/government/council-civic-administration/council-committee-meetings/meetings"
# Need to confirm. When stacking params does the type need to be f[1]?
SEARCH_FORMAT_COMMITTEE="f[1]=meeting_type%3A"
SEARCH_FORMAT_DATE="f[0]=meeting_date%3A"
SEARCH_FORMAT_QUERY="search=query&sort_by=field_meeting_date"

# As far as I'm aware there are no meetings prior to 2011.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

#######################################
# Download one URL quietly with the browser-like user agent.
# Arguments: $1 - URL, $2 - destination file
# Returns:   the exit status of wget
#######################################
fetch() {
  wget --user-agent="$WGET_UA" "$1" -O "$2" -q #--show-progress
}

#######################################
# Filter: decode the HTML entities that show up inside href attributes
# (&amp; and the numeric apostrophe entity), reading stdin, writing stdout.
#######################################
decode_entities() {
  sed -e 's/&amp;/\&/g' -e "s/&#0*39;/'/g"
}

#######################################
# Print $1 with leading and trailing whitespace removed.
#######################################
trim() {
  local text=$1
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "$text"
}

#######################################
# Download a document if a URL for it was found; do nothing otherwise.
# Arguments: $1 - URL (may be empty), $2 - destination file,
#            $3 - progress message printed before the download
#######################################
save_document() {
  local url=$1 dest=$2 message=$3
  [[ -n "$url" ]] || return 0
  echo "$message"
  if ! fetch "$url" "$dest"; then
    echo "SCRAPE_MEETINGS: WARNING: download failed: $url" >&2
  fi
}

#######################################
# Archive everything linked from one meeting's "links" line of the search
# results: agenda/minutes documents, agenda attachments and the recording link.
# Globals:   MEETING_NAME, MEETING_YEAR, MEETING_MONTH, MEETING_DAY (read),
#            TEMP_DIR, AGENDA_HTML, ARCHIVE_ROOT (read)
# Arguments: $1 - the search-result line that carries the document links
# Returns:   1 if the meeting directory cannot be created, 0 otherwise
#######################################
process_meeting() {
  local link_line=$1
  local urls_file="${TEMP_DIR}meeting_urls"
  local types_file="${TEMP_DIR}meeting_types"
  local attachment_urls_file="${TEMP_DIR}attachment_urls"
  local attachment_names_file="${TEMP_DIR}attachment_names"
  local doc_url doc_type meeting_dir video_stream
  local attachment_path attachment_name
  local have_agenda_html="FALSE"
  local AGENDA_HTML_URL="" AGENDA_PDF_URL=""
  local AGENDA_REVISE_HTML_URL="" AGENDA_REVISE_PDF_URL=""
  local MINUTES_HTML_URL="" MINUTES_PDF_URL="" MINUTES_ATTACH_PDF_URL=""

  echo "SCRAPE_MEETINGS: -========================================================================-"
  echo "SCRAPE_MEETINGS: Working on $MEETING_NAME ($MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY)"
  echo "SCRAPE_MEETINGS: All files to be saved as $MEETING_NAME/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/"
  echo "SCRAPE_MEETINGS: -========================================================================-"
  echo "SCRAPE_MEETINGS: Task starting on: $(date)"

  # Collapse whitespace runs so the link labels compare cleanly below.
  link_line=$(printf '%s\n' "$link_line" | tr -s '[:space:]' ' ')

  # Grab meeting item links: one line per https link, in document order.
  # "sed -n ... p" emits each link exactly once, so this list stays aligned
  # line-for-line with the label list built next.
  printf '%s\n' "$link_line" \
    | sed 's/href=./\nhref="/g' \
    | grep 'href="https' \
    | sed -n 's/.*href="\([^"]*\)".*/\1/p' \
    > "$urls_file"

  # Grab meeting item types (the link labels), in the same order.
  printf '%s\n' "$link_line" \
    | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' \
    | grep 'rel="noreferrer">' \
    | sed -n 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' \
    > "$types_file"

  echo "SCRAPE_MEETINGS: Found the following documents:"
  while IFS= read -r doc_url && IFS= read -r doc_type <&3; do
    echo "SCRAPE_MEETINGS: - $doc_type"
    doc_url=$(printf '%s\n' "$doc_url" | decode_entities)
    # Labels arrive with trailing whitespace; compare the trimmed text.
    doc_type=$(trim "$doc_type")
    case "$doc_type" in
      "Agenda (HTML)") AGENDA_HTML_URL=$doc_url ;;
      "Agenda (PDF)") AGENDA_PDF_URL=$doc_url ;;
      "Revised Agenda (HTML)") AGENDA_REVISE_HTML_URL=$doc_url ;;
      "Revised Agenda (PDF)") AGENDA_REVISE_PDF_URL=$doc_url ;;
      "Minutes (HTML)") MINUTES_HTML_URL=$doc_url ;;
      "Minutes (PDF)") MINUTES_PDF_URL=$doc_url ;;
      "Minutes with Attachments (PDF)") MINUTES_ATTACH_PDF_URL=$doc_url ;;
    esac
  done < "$urls_file" 3< "$types_file"

  # Always prefer Revised Agendas.
  # The working copy is removed first so that a meeting without an HTML agenda
  # can never be parsed against the previous meeting's page.
  echo "SCRAPE_MEETINGS: Downloading agenda HTML..."
  rm -f -- "$AGENDA_HTML"
  if [[ -n "$AGENDA_REVISE_HTML_URL" ]]; then
    fetch "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML" && have_agenda_html="TRUE"
  elif [[ -n "$AGENDA_HTML_URL" ]]; then
    fetch "$AGENDA_HTML_URL" "$AGENDA_HTML" && have_agenda_html="TRUE"
  fi

  # A "/" in the meeting name would otherwise be taken as a path separator.
  meeting_dir=$(printf "%s/%s/%s/%s-%s" "$ARCHIVE_ROOT" "${MEETING_NAME//\//-}" \
    "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
  if ! mkdir -p -- "$meeting_dir/Attachments"; then
    echo "SCRAPE_MEETINGS: Cannot create $meeting_dir" >&2
    return 1
  fi

  if [[ "$have_agenda_html" == "TRUE" ]]; then
    # Direct video links are always https://video.isilive.ca/london/STREAM_NAME.
    # There are some eScribe ones, but those are in m3u8s and are really
    # annoying to work with... not as annoying as more sed though.
    video_stream=$(grep 'id="isi_player"' "$AGENDA_HTML" \
      | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' \
      | sed 's/ /%20/g')
    if [[ -n "$video_stream" ]]; then
      echo "SCRAPE_MEETINGS: Saving recording URL..."
      printf '%s\n' "https://video.isilive.ca/london/$video_stream" \
        > "$meeting_dir/RecordingLink.txt"
    fi

    # Get attachment links (awk drops repeated lines, keeping first-seen order).
    grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" "$AGENDA_HTML" \
      | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' \
      | grep 'filestream.ashx' \
      | sed 's/. data-toggle/\" data-toggle/p' \
      | sed 's/href=.\([^/]*\)".*/\1/p' \
      | awk '!x[$0]++' \
      > "$attachment_urls_file"

    # Get attachment names, in the same order as the links above.
    grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" "$AGENDA_HTML" \
      | sed 's/data-original-title=./\ndata-original-title='\''/g' \
      | grep 'data-original-title' \
      | sed 's/data-original-title=.//p' \
      | sed 's/.pdf['\'':"].*/.pdf/g' \
      | awk '!x[$0]++' \
      > "$attachment_names_file"

    # Download each attachment and store it under the name grabbed above.
    echo "SCRAPE_MEETINGS: Found the following agenda attachments:"
    while IFS= read -r attachment_path && IFS= read -r attachment_name <&3; do
      echo "SCRAPE_MEETINGS: - $attachment_name"
      [[ -n "$attachment_name" ]] || continue
      # The name comes from the remote page: never let it introduce a path.
      if ! fetch "https://pub-london.escribemeetings.com/$attachment_path" \
        "$meeting_dir/Attachments/${attachment_name//\//_}"; then
        echo "SCRAPE_MEETINGS: WARNING: download failed: $attachment_name" >&2
      fi
    done < "$attachment_urls_file" 3< "$attachment_names_file"
    echo "SCRAPE_MEETINGS: All attachments saved."
  else
    echo "SCRAPE_MEETINGS: No agenda HTML available; skipping recording link and attachments."
  fi

  # Agendas: keep the PDFs when there are any, otherwise fall back to HTML.
  if [[ -n "$AGENDA_REVISE_PDF_URL" || -n "$AGENDA_PDF_URL" ]]; then
    save_document "$AGENDA_REVISE_PDF_URL" "$meeting_dir/Agenda_Revised.pdf" \
      "SCRAPE_MEETINGS: Saving revised agenda as PDF..."
    save_document "$AGENDA_PDF_URL" "$meeting_dir/Agenda.pdf" \
      "SCRAPE_MEETINGS: Saving regular agenda as PDF..."
  else
    save_document "$AGENDA_REVISE_HTML_URL" "$meeting_dir/Agenda_Revised.html" \
      "SCRAPE_MEETINGS: Saving revised agenda as HTML... (no PDF found!)"
    save_document "$AGENDA_HTML_URL" "$meeting_dir/Agenda.html" \
      "SCRAPE_MEETINGS: Saving regular agenda as HTML... (no PDF found!)"
  fi

  # Minutes: same preference, PDF first and HTML only as a fallback.
  if [[ -n "$MINUTES_ATTACH_PDF_URL" || -n "$MINUTES_PDF_URL" ]]; then
    save_document "$MINUTES_ATTACH_PDF_URL" "$meeting_dir/Minutes_With_Attachments.pdf" \
      "SCRAPE_MEETINGS: Saving minutes with attachments as PDF..."
    save_document "$MINUTES_PDF_URL" "$meeting_dir/Minutes.pdf" \
      "SCRAPE_MEETINGS: Saving minutes as PDF..."
  else
    save_document "$MINUTES_HTML_URL" "$meeting_dir/Minutes.html" \
      "SCRAPE_MEETINGS: Saving minutes as HTML... (no PDF found!)"
  fi

  echo "SCRAPE_MEETINGS: All files from this meeting have been saved."
}

# Shape the rest of the script needs from a meeting title.
meeting_info_re='^([A-Za-z]+) ([0-9]+), ([0-9]+) - (.*)$'

year=$(date +%Y)
end_year=$((year + 1))
echo "$end_year"
SEARCH_END="FALSE"

while (( year < end_year )); do
  page=0
  SEARCH_END="FALSE"
  while [[ "$SEARCH_END" == "FALSE" ]]; do
    echo "SCRAPE_MEETINGS: Downloading search results... Page $page of $year"
    fetch "${SEARCH_URL}?${SEARCH_FORMAT_DATE}${year}&page=${page}" "$SEARCH_PAGE"
    wget_status=$?

    if (( wget_status == 8 )); then
      # wget exits with 8 on an error response from the server; this script
      # has always taken that to mean "past the last page".
      SEARCH_END="TRUE"
      echo "SCRAPE_MEETINGS: No more pages!"
    elif (( wget_status != 0 )); then
      # Any other failure (network, DNS, ...) must stop the loop as well,
      # otherwise the site would be requested again and again without end.
      SEARCH_END="TRUE"
      echo "SCRAPE_MEETINGS: Search request failed (wget exit $wget_status). Stopping." >&2
    elif grep -q "No results found." "$SEARCH_PAGE"; then
      SEARCH_END="TRUE"
      echo "SCRAPE_MEETINGS: No more pages!"
    else
      # All meeting items in the search results are formatted like so:
      #  - One line with the name
      #  - Second line with all other info including links
      #
      # The first line is recognised by the class
      # "views-field-field-meeting-notes". FOUNDMEETING=TRUE records that it
      # was just seen, so the next line read is handled as the links line.
      FOUNDMEETING="FALSE"
      meetings_on_page=0
      while IFS= read -r LINE; do
        if [[ "$FOUNDMEETING" == "TRUE" ]]; then
          FOUNDMEETING="FALSE"
          #echo "CANCEL NOW!!!"
          #sleep 5
          process_meeting "$LINE"
        fi

        [[ "$LINE" == *"views-field-field-meeting-notes"* ]] || continue

        # NOTE(review): the opening-tag part of this expression was not
        # recoverable from the copy of the script under review. It now anchors
        # on the "Month D, YYYY - Name" text that directly precedes </div>,
        # which is the shape the parsing below requires. Confirm against the
        # live markup.
        MEETING_INFO=$(printf '%s\n' "$LINE" \
          | tr -s '[:space:]' ' ' \
          | sed -n 's/.*>[[:space:]]*\([A-Za-z][A-Za-z]* [0-9][0-9]*, [0-9][0-9]* - [^<]*\)<\/div>.*/\1/p')

        if [[ ! "$MEETING_INFO" =~ $meeting_info_re ]]; then
          echo "SCRAPE_MEETINGS: WARNING: could not read a meeting title, skipping entry." >&2
          continue
        fi
        MEETING_MONTH_WORD=${BASH_REMATCH[1]}
        MEETING_DAY_SHORT=${BASH_REMATCH[2]}
        MEETING_YEAR=${BASH_REMATCH[3]}
        MEETING_NAME=$(trim "${BASH_REMATCH[4]}")
        # 10# forces base ten so that "08" and "09" are not rejected as octal.
        printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"

        case "$MEETING_MONTH_WORD" in
          January) MEETING_MONTH="01" ;;
          February) MEETING_MONTH="02" ;;
          March) MEETING_MONTH="03" ;;
          April) MEETING_MONTH="04" ;;
          May) MEETING_MONTH="05" ;;
          June) MEETING_MONTH="06" ;;
          July) MEETING_MONTH="07" ;;
          August) MEETING_MONTH="08" ;;
          September) MEETING_MONTH="09" ;;
          October) MEETING_MONTH="10" ;;
          November) MEETING_MONTH="11" ;;
          December) MEETING_MONTH="12" ;;
          *) MEETING_MONTH="--" ;;
        esac
        if [[ "$MEETING_MONTH" == "--" || -z "$MEETING_NAME" ]]; then
          echo "SCRAPE_MEETINGS: WARNING: unusable meeting title '$MEETING_INFO', skipping entry." >&2
          continue
        fi
        (( meetings_on_page++ ))

        # Only meetings from last month onwards are fetched; the first older
        # entry ends the whole search.
        if (( 10#$MEETING_YEAR >= 10#$current_year )) \
          && (( 10#$MEETING_MONTH >= 10#$current_month - 1 )); then
          #echo "date is greater"
          FOUNDMEETING="TRUE"
        else
          echo "SCRAPE_MEETINGS: Dates are in the past! Abort."
          SEARCH_END="TRUE"
          break
        fi
      done < "$SEARCH_PAGE"

      # A page without a single readable meeting (block page, changed markup)
      # would otherwise make the loop request page after page.
      if [[ "$SEARCH_END" == "FALSE" ]] && (( meetings_on_page == 0 )); then
        SEARCH_END="TRUE"
        echo "SCRAPE_MEETINGS: No meetings found on this page. Stopping." >&2
      fi
    fi
    (( page++ ))
  done
  (( year++ ))
done

echo "Done job: SCRAPE_MEETINGS: $(date)"