LondonScrapers/SCRAPE_LTC.SH
2026-06-19 23:30:51 -04:00

170 lines
8.7 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# SCRAPE_LTC.SH
# Downloads London Transit Commission (LTC) committee agendas and minutes
# into ./LondonArchive/LTC. This first section prints the banner, loads the
# shared helper functions, prepares the working directories, fetches the index
# page and initialises the date state used by the crawl that follows.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LTC.SH: Downloads LTC committee agendas and minutes -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# Shared helpers; the crawl below calls _time_parse_helper and
# _utils_download_helper, which are expected to come from this file.
source ./functions/.functions

# NOTE(review): not referenced anywhere in the visible script; kept in case a
# sourced helper reads it.
MEETINGS_PAGE="./tmp.html"

# London seems to have recently started blocking unusual user agents, so the
# default wget identity (and even ping) is rejected. Presenting a regular
# desktop browser user agent gets the request through.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# -p: create missing parents and stay quiet when the directories already exist
# (a plain mkdir prints "File exists" errors on every run after the first).
mkdir -p "./LondonArchive/LTC" "./tmp"

# Fetch the index page that lists every committee and meeting.
if ! wget --user-agent="$WGET_UA" "https://www.londontransit.ca/agendas-and-minutes/" -O "./tmp/index.html" -q; then #--show-progress
  # Do not abort: the crawl simply finds nothing in an empty index. Make the
  # failure visible instead of passing silently.
  echo "WARNING: could not download the LTC agendas index; ./tmp/index.html may be empty or incomplete." >&2
fi

current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# These must hold digits before the first date row is parsed; otherwise the
# arithmetic comparison fails with "10#: invalid integer constant".
ITEM_YEAR="0000"
ITEM_MONTH="00"
ITEM_DAY="00"
# ---------------------------------------------------------------------------
# Index crawl.
#
# Reads ./tmp/index.html line by line and keeps three pieces of state:
#   COMMITTEENAME / COMMITTEENAME_SLUG  - set when an <h2> committee heading
#                                         containing "Agendas and Minutes" is seen
#   ITEM_YEAR / ITEM_MONTH / ITEM_DAY   - set by _time_parse_helper (defined in
#                                         the sourced helpers) from a date row
# Every other line is searched for agenda/minutes links. PDFs are downloaded
# directly; HTML agendas/minutes are crawled for attachments, and HTML
# attachments are rendered to PDF with wkhtmltopdf.
#
# Reads globals: current_year, current_month (set earlier in the script).
# ---------------------------------------------------------------------------

# Collapse runs of blanks in $1 to single spaces and trim both ends.
# Same result as an unquoted `echo $1`, but without pathname expansion, so a
# stray `*` or `?` in the HTML can never be replaced by local file names.
_ltc_squash() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  printf '%s\n' "${words[*]}"
}

# Print the href of line $1, cut just after ".pdf", when the line matches the
# grep pattern $2 and mentions ".pdf". Prints nothing otherwise.
_ltc_pdf_href() {
  grep -- "$2" <<< "$1" | grep "\.pdf" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/'
}

# Print the href of line $1 (up to the closing quote) when the line matches
# the grep pattern $2. Prints nothing otherwise.
_ltc_html_href() {
  grep -- "$2" <<< "$1" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//'
}

# Succeeds when a committee and a meeting date are known and the meeting is
# no older than the previous calendar month.
# Remove the date comparison to download the full page; it exists for the
# automated LA scripts.
# Months are compared as year*12+month so the window also works across a year
# boundary (December -> January in either direction).
_ltc_meeting_in_window() {
  [[ -n "$COMMITTEENAME" && -n "$ITEM_YEAR" && -n "$ITEM_MONTH" ]] || return 1
  # 10# forces base 10 so that "08" and "09" are not read as invalid octal.
  (( 10#$ITEM_YEAR * 12 + 10#$ITEM_MONTH >= 10#$current_year * 12 + 10#$current_month - 1 ))
}

while IFS= read -r LINE_PRE || [[ -n "$LINE_PRE" ]]; do
  # Normalise whitespace, turn the first non-breaking space (UTF-8 C2 A0) into
  # a regular space, then normalise again so the new space cannot double up.
  LINE=$(_ltc_squash "$LINE_PRE" | sed 's/\xC2\xA0/ /')
  LINE=$(_ltc_squash "$LINE")

  if [[ "$LINE" == *"<h2>"* && "$LINE" == *"Agendas and Minutes"* ]]; then
    # Only an <h2> without a class is the title of a committee; "Agendas and
    # Minutes" in the text confirms it. This line is only a marker for a new
    # committee, nothing is downloaded for it.
    COMMITTEENAME=$(sed -e 's/<h2>//' -e 's/<\/h2>//' <<< "$LINE")
    echo "NEW COMMITTEE"
    echo "$COMMITTEENAME"
    if [[ "$COMMITTEENAME" == *APTSAC* ]]; then
      echo "Committee slug set"
      COMMITTEENAME_SLUG="Accessible Public Transit Services Advisory Committee"
    elif [[ "$COMMITTEENAME" == *Commission* ]]; then
      echo "Committee slug set"
      COMMITTEENAME_SLUG="Commission"
    else
      # Unknown committee: derive a folder name from the heading rather than
      # silently filing its documents under the previous committee's slug.
      COMMITTEENAME_SLUG=${COMMITTEENAME//\//-}
      echo "WARNING: unrecognised committee heading; archiving under '$COMMITTEENAME_SLUG'" >&2
    fi
    # A date parsed for the previous committee must not leak into this one.
    ITEM_YEAR="0000"
    ITEM_MONTH="00"
    ITEM_DAY="00"

  elif [[ "$LINE" == *"</strong></td>"* ]]; then
    # Date row. Remove HTML junk from the date string: everything up to
    # <strong>, everything from </strong> or <span, trailing blanks and the
    # first period.
    DATES_CLEAN=$(sed -e 's/.*<strong>//' -e 's/<\/strong>.*//' -e 's/<span.*//' -e 's/[[:space:]]*$//' -e 's/\.//' <<< "$LINE")
    _time_parse_helper "$DATES_CLEAN"
    echo " NEW MEETING FOUND"
    echo " DATE IS $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"

  elif _ltc_meeting_in_window; then
    # Not changing meetings, and a committee and date have already been set.
    # If the line carries a document link --> make folder --> download.
    AGENDAURL=$(_ltc_pdf_href "$LINE" "PDF Agenda")
    AGENDAHTMLURL=$(_ltc_html_href "$LINE" "HTML Agenda")
    MINUTESURL=$(_ltc_pdf_href "$LINE" "PDF Minutes")
    MINUTESHTMLURL=$(_ltc_html_href "$LINE" "HTML Minutes")

    if [[ -n "$AGENDAURL" || -n "$MINUTESURL" || -n "$AGENDAHTMLURL" || -n "$MINUTESHTMLURL" ]]; then
      meeting_dir="./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY"
      mkdir -p "$meeting_dir"

      if [[ -n "$AGENDAURL" ]]; then
        echo " DOWNLOAD AGENDA PDF"
        echo " $AGENDAURL"
        _utils_download_helper "$AGENDAURL" "$meeting_dir/Agenda.pdf"
      elif [[ -n "$MINUTESURL" ]]; then
        echo " DOWNLOAD MINUTES PDF"
        echo " $MINUTESURL"
        _utils_download_helper "$MINUTESURL" "$meeting_dir/Minutes.pdf"
      else
        # Only an HTML agenda or HTML minutes link is left at this point.
        # The page is fetched to ./tmp/work.html and crawled for attachments.
        if [[ -n "$AGENDAHTMLURL" ]]; then
          echo " DOWNLOAD AGENDA HTML TO CRAWL"
          echo " $AGENDAHTMLURL"
          _utils_download_helper "$AGENDAHTMLURL" "./tmp/work.html"
        elif [[ -n "$MINUTESHTMLURL" ]]; then
          echo " DOWNLOAD MINUTES HTML TO CRAWL"
          echo " $MINUTESHTMLURL"
          _utils_download_helper "$MINUTESHTMLURL" "./tmp/work.html"
        fi

        # Start outside any <article>; a page that never closed its article
        # must not affect the next page.
        ISARTICLE=0
        while IFS= read -r LINE_HTML_PRE || [[ -n "$LINE_HTML_PRE" ]]; do
          LINE_HTML=$(_ltc_squash "$LINE_HTML_PRE" | sed 's/\xC2\xA0/ /')
          LINE_HTML=$(_ltc_squash "$LINE_HTML")
          GREPLINK=$(_ltc_html_href "$LINE_HTML" "<a href")

          if [[ "$LINE_HTML" == *"<article"* ]]; then
            echo " FOUND INDEX ARTICLE START"
            ISARTICLE=1
          elif [[ "$LINE_HTML" == *"</article>"* ]]; then
            echo " END OF INDEX ARTICLE"
            ISARTICLE=0
          elif [[ -n "$GREPLINK" ]] && (( ISARTICLE )); then
            # A link inside the article body is an attachment.
            mkdir -p "$meeting_dir/Attachments"
            ISPDF=$(_ltc_squash "$GREPLINK" | grep "\.pdf")
            if [[ -n "$ISPDF" ]]; then
              # Direct PDF attachment: keep the file name from the URL.
              PDFNAME=${ISPDF##*/}
              echo " DOWNLOAD ATTACHMENT PDF"
              echo " $ISPDF"
              _utils_download_helper "$ISPDF" "$meeting_dir/Attachments/$PDFNAME"
            else
              # HTML attachment. Extract its title from the link text: drop
              # <sup> tags, take the anchor text, spell out "&amp;", blank out
              # 5-character entities together with the character after them,
              # remove Unicode dashes and squeeze the doubled spaces that
              # leaves behind.
              ATTACHTITLE=$(sed -e 's/<sup>//g' -e 's/<\/sup>//g' <<< "$LINE_HTML" \
                | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' \
                | sed -e 's/&amp;/and/g' -e 's/&.....;./ /g' \
                | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' \
                | tr -s ' ')
              # The title becomes a file name: a "/" would point into a
              # directory that does not exist, and an empty title would give a
              # hidden ".pdf" file.
              attach_file=${ATTACHTITLE//\//-}
              [[ -n "$attach_file" ]] || attach_file="Untitled attachment"
              echo " DOWNLOAD ATTACHMENT HTML"
              echo " $ATTACHTITLE"
              echo " $GREPLINK"
              _utils_download_helper "$GREPLINK" "./tmp/attachment.html"

              ISATTACHMENTARTICLE=0
              while IFS= read -r LINE_ATTACH_PRE || [[ -n "$LINE_ATTACH_PRE" ]]; do
                # LINE_ATTACH is what gets copied into the rendered page;
                # attach_flat is the fully normalised form used for matching.
                LINE_ATTACH=$(_ltc_squash "$LINE_ATTACH_PRE" | sed 's/\xC2\xA0/ /')
                attach_flat=$(_ltc_squash "$LINE_ATTACH")
                GREPATTACHMENTLINK=$(_ltc_pdf_href "$attach_flat" "\.pdf")

                if [[ "$attach_flat" == *"<article"* ]]; then
                  echo " FOUND ATTACHMENT ARTICLE START"
                  # CSS for the HTML is in the default template
                  cat ./template/default.html > ./tmp/new.html
                  printf '%s\n' "$LINE_ATTACH" >> ./tmp/new.html
                  ISATTACHMENTARTICLE=1
                elif [[ "$attach_flat" == *"</article>"* ]]; then
                  echo " END OF ATTACHMENT ARTICLE"
                  printf '%s\n' "$LINE_ATTACH" >> ./tmp/new.html
                  echo " PROCESSED TO PDF"
                  wkhtmltopdf ./tmp/new.html "$meeting_dir/Attachments/$attach_file.pdf" 2> /dev/null
                  ISATTACHMENTARTICLE=0
                elif [[ -n "$GREPATTACHMENTLINK" ]] && (( ISATTACHMENTARTICLE )); then
                  # A PDF referenced from inside the attachment: download it
                  # and list its name in the rendered page instead of the link.
                  ISREFPDF=$(_ltc_squash "$GREPATTACHMENTLINK" | grep "\.pdf")
                  if [[ -n "$ISREFPDF" ]]; then
                    PDFREFNAME=${ISREFPDF##*/}
                    echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
                    echo " $GREPATTACHMENTLINK"
                    _utils_download_helper "$ISREFPDF" "$meeting_dir/Attachments/$PDFREFNAME"
                    echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
                  fi
                elif (( ISATTACHMENTARTICLE )); then
                  # Ordinary article content: copy it through unchanged.
                  printf '%s\n' "$LINE_ATTACH" >> ./tmp/new.html
                fi
              done < ./tmp/attachment.html
            fi
          fi
        done < ./tmp/work.html
      fi
    fi
  fi
done < "./tmp/index.html"