#!/usr/bin/env bash echo -e "\n-========================================================================-" echo -e "-=- -=-" echo -e "-=- SCRAPE_LTC.SH: Downloads LTC committee agendas and minutes -=-" echo -e "-=- -=-" echo -e "-=- Lillian Skinner -=-" echo -e "-=- -=-" echo -e "-========================================================================-" MEETINGS_PAGE="./tmp.html" # London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person! WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87" mkdir "./LondonArchive" mkdir "./LondonArchive/LTC" mkdir "./tmp" wget --user-agent="$WGET_UA" "https://www.londontransit.ca/agendas-and-minutes/" -O "./tmp/index.html" -q #--show-progress current_year=$(date +%Y) current_month=$(date +%m) current_day=$(date +%d) # If I don't set these values then "10#: invalid integer constant" MEETING_YEAR="0000" MEETING_MONTH="00" MEETING_DAY="00" while IFS= read -r LINE_PRE; do LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /') # Only

# -----------------------------------------------------------------------------
# OVERVIEW (review notes; the statements themselves are left untouched)
#
# Purpose: per its own banner, this script downloads LTC committee agendas and
# minutes. The visible statements do the following:
#   1. Create ./LondonArchive, ./LondonArchive/LTC and ./tmp, then fetch
#      https://www.londontransit.ca/agendas-and-minutes/ into ./tmp/index.html
#      with wget, sending the browser-like user agent stored in WGET_UA.
#   2. Read ./tmp/index.html line by line, replacing the first UTF-8
#      non-breaking space (bytes C2 A0) on each line with a plain space.
#   3. On a line containing "Agendas and Minutes", treat it as a committee
#      heading and set COMMITTEENAME_SLUG: "APTSAC" maps to "Accessible Public
#      Transit Services Advisory Committee", "Commission" maps to "Commission".
#   4. On lines carrying a "PDF Agenda" / "PDF Minutes" link ending in .pdf,
#      download it as Agenda.pdf / Minutes.pdf under
#      ./LondonArchive/LTC/<slug>/<MEETING_YEAR>/<MEETING_MONTH>-<MEETING_DAY>/.
#   5. On lines carrying an "HTML Agenda" / "HTML Minutes" link, download the
#      page to ./tmp/work.html and scan it for links while ISARTICLE is set:
#        - links containing ".pdf" are saved into the meeting's Attachments/
#          folder under their own file name;
#        - other links are fetched to ./tmp/attachment.html, the article part
#          is appended to a copy of ./template/default.html in ./tmp/new.html,
#          and wkhtmltopdf renders that to Attachments/<ATTACHTITLE>.pdf.
#          PDFs referenced from inside such an attachment are downloaded too
#          and their file names are appended to ./tmp/new.html.
#
# External commands used in the visible text: wget, sed, grep, perl, date,
# mkdir, cat, wkhtmltopdf.
#
# NOTE(review): this copy of the script looks damaged and should be restored
# from the author's original before it is run or refactored:
#   - The line breaks appear to have been collapsed. As written, the first line
#     starts with "#!", so a shell would treat that entire line as a comment.
#   - Several grep/sed patterns are empty or truncated (for example grep "" and
#     sed 's/.*//'). Presumably HTML tag text was stripped out of them; confirm
#     the real patterns against the original.
#   - The text between the DATES_CLEAN pipeline and "= 10#$current_year ))" is
#     missing. The code that assigns MEETING_YEAR / MEETING_MONTH / MEETING_DAY
#     from the parsed date and opens the date comparison is not visible here.
#   - GREPARTICLEEND and GREPATTACHMENTARTICLEEND are tested and reset, but the
#     statements that assign them are not visible; presumably lost as above.
#
# NOTE(review): MEETINGS_PAGE and current_day are assigned but never referenced
# in the visible text; they may be used only in the missing portion, or unused.
#
# NOTE(review): most expansions are unquoted (echo $LINE | ...), so whitespace
# runs in each line are collapsed and glob characters may expand before
# grep/sed see them. Whether the parsing relies on that collapsing cannot be
# determined from this copy; check before quoting them.
# -----------------------------------------------------------------------------
without a class is the title of a committee. We'll confirm it is the title by checking for "Agendas and Minutes". GREPCOMMITTEE=$(echo $LINE | grep "

" | grep "Agendas and Minutes") GREPDATE=$(echo $LINE | grep "") if [[ "$GREPCOMMITTEE" != "" ]]; then COMMITTEENAME=$(echo $LINE | sed 's/

//' | sed 's/<\/h2>//') echo "NEW COMMITTEE" echo "$COMMITTEENAME" if [[ "$(echo "$COMMITTEENAME" | grep "APTSAC")" != "" ]]; then echo "Committee slug set" COMMITTEENAME_SLUG="Accessible Public Transit Services Advisory Committee" elif [[ "$(echo "$COMMITTEENAME" | grep "Commission")" != "" ]]; then echo "Committee slug set" COMMITTEENAME_SLUG="Commission" fi # Only a marker for a new committee. Do nothing else. GREPCOMMITTEE="" elif [[ "$GREPDATE" != "" ]]; then # Remove HTML junk from date string. DATES_CLEAN=$(echo $GREPDATE | sed 's/.*//' | sed 's/<\/strong>.*//' | sed 's/= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then # Not changing meetings, and we know that an old meeting has alread been set. Keep going. # If match --> make folder --> download AGENDAURL=$(echo $LINE | grep "PDF Agenda" | grep "\.pdf" | sed 's/.*.*/\1/' | sed 's/.pdf.*/.pdf/') AGENDAHTMLURL=$(echo $LINE | grep "HTML Agenda" | sed 's/.*.*/\1/' | sed 's/".*//') MINUTESURL=$(echo $LINE | grep "PDF Minutes" | grep "\.pdf" | sed 's/.*.*/\1/' | sed 's/.pdf.*/.pdf/') MINUTESHTMLURL=$(echo $LINE | grep "HTML Minutes" | sed 's/.*.*/\1/' | sed 's/".*//') # Yes, I know there are HTML agendas. LTC is a lower priority, so I am not able to put a bunch of time into making a parser for the HTML. Sorry. # Well... this aged well. 
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR" 2> /dev/null mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY" 2> /dev/null if [[ "$AGENDAURL" != "" ]]; then echo " DOWNLOAD AGENDA PDF" echo " $AGENDAURL" wget --user-agent="$WGET_UA" "$AGENDAURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Agenda.pdf" -c -q #--show-progress elif [[ "$MINUTESURL" != "" ]]; then echo " DOWNLOAD MINUTES PDF" echo " $MINUTESURL" wget --user-agent="$WGET_UA" "$MINUTESURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Minutes.pdf" -c -q #--show-progress elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then if [[ "$AGENDAHTMLURL" != "" ]]; then echo " DOWNLOAD AGENDA HTML TO CRAWL" echo " $AGENDAHTMLURL" wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress elif [[ "$MINUTESHTMLURL" != "" ]]; then echo " DOWNLOAD MINUTES HTML TO CRAWL" echo " $MINUTESHTMLURL" wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress fi while IFS= read -r LINE_HTML_PRE; do LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /') GREPARTICLESTART=$(echo $LINE_HTML | grep "") GREPLINK=$(echo $LINE_HTML | grep ".*/\1/' | sed 's/".*//') if [[ "$GREPARTICLESTART" != "" ]]; then echo " FOUND INDEX ARTICLE START" ISARTICLE="TRUE" elif [[ "$GREPARTICLEEND" != "" ]]; then echo " END OF INDEX ARTICLE" ISARTICLE="" elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null ISPDF=$(echo $GREPLINK | grep "\.pdf") if [[ "$ISPDF" != "" ]]; then PDFNAME=$(echo $ISPDF | sed 's/.*\///') echo " DOWNLOAD ATTACHMENT PDF" echo " 
$ISPDF" wget --user-agent="$WGET_UA" "$ISPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFNAME" -c -q #--show-progress else # Extract title of attachment ATTACHTITLE=$(echo $LINE_HTML | sed 's///g' | sed 's/<\/sup>//g' | sed -n 's/.*\([^<]*\)<\/a>.*/\1/p' | sed 's/&/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g') echo " DOWNLOAD ATTACHMENT HTML" echo " $ATTACHTITLE" echo " $GREPLINK" wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress while IFS= read -r LINE_ATTACH_PRE; do LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /') GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "") GREPATTACHMENTLINK=$(echo $LINE_ATTACH | grep "\.pdf" | sed 's/.*.*/\1/' | sed 's/.pdf.*/.pdf/') if [[ "$GREPATTACHMENTARTICLESTART" != "" ]]; then echo " FOUND ATTACHMENT ARTICLE START" # CSS for the HTML is in the default template cat ./template/default.html > ./tmp/new.html echo "$LINE_ATTACH" >> ./tmp/new.html ISATTACHMENTARTICLE="TRUE" elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then echo " END OF ATTACHMENT ARTICLE" echo "$LINE_ATTACH" >> ./tmp/new.html echo " PROCESSED TO PDF" wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null ISATTACHMENTARTICLE="" elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf") if [[ "$ISREFPDF" != "" ]]; then PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///') echo " DOWNLOAD REFERENCED ATTACHMENT PDF" echo " $GREPATTACHMENTLINK" wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress echo "
  • $PDFREFNAME
" >> ./tmp/new.html fi elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then echo "$LINE_ATTACH" >> ./tmp/new.html fi LINE_ATTACH="" GREPATTACHMENTARTICLESTART="" GREPATTACHMENTARTICLEEND="" GREPATTACHMENTLINK="" done < ./tmp/attachment.html ISPDF="" fi fi LINE_HTML="" GREPARTICLESTART="" GREPARTICLEEND="" GREPLINK="" done < ./tmp/work.html fi AGENDAURL="" AGENDAHTMLURL="" MINUTESURL="" MINUTESHTMLURL="" fi fi fi done < "./tmp/index.html"