LondonScrapers/SCRAPE_LTC.SH

185 lines
9.7 KiB
Bash

#!/usr/bin/env bash
# SCRAPE_LTC.SH -- setup stage.
# Prints the banner, creates the archive and scratch directories, downloads the
# LTC "agendas-and-minutes" index page to ./tmp/index.html, and seeds the date
# variables that the scraping loop further down compares against.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LTC.SH: Downloads LTC committee agendas and minutes -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
# NOTE(review): not referenced anywhere else in the visible script; kept in case
# something outside this file still reads it.
MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents (plain wget and even
# ping are refused), so every request pretends to come from a desktop browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
# -p creates missing parents and is silent when the directories already exist,
# so re-running the scraper no longer prints "File exists" errors.
mkdir -p "./LondonArchive/LTC" "./tmp"
# Fetch the index page that the main loop parses line by line.
if ! wget --user-agent="$WGET_UA" "https://www.londontransit.ca/agendas-and-minutes/" -O "./tmp/index.html" -q; then #--show-progress
    # Only warn: the script may be sourced by a wrapper, so do not exit here.
    echo "SCRAPE_LTC.SH: failed to download the LTC agendas-and-minutes index page" >&2
fi
current_year=$(date +%Y)
current_month=$(date +%m)
# NOTE(review): current_day is not used by the visible script.
current_day=$(date +%d)
# Seed the meeting date with numeric placeholders. Without them the loop's
# date comparison runs on empty values before the first meeting date has been
# parsed, and bash reports "10#: invalid integer constant".
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
# ---------------------------------------------------------------------------
# Main scraping stage.
#
# Reads ./tmp/index.html line by line and keeps three pieces of state:
#   COMMITTEENAME / COMMITTEENAME_SLUG - committee whose section is being read
#   MEETING_YEAR / MEETING_MONTH / MEETING_DAY - date of the current meeting row
# Any other line is searched for agenda/minutes links, which are downloaded
# into ./LondonArchive/LTC/<slug>/<year>/<month>-<day>/ when the meeting is
# recent enough (see ltc_is_recent_meeting).
# Expects WGET_UA, current_year and current_month to be set by the setup code.
# ---------------------------------------------------------------------------

#######################################
# Flatten one line of HTML for matching: every UTF-8 non-breaking space becomes
# a normal space, runs of whitespace collapse to one space, and leading and
# trailing whitespace is dropped. Unlike an unquoted `echo $line`, characters
# such as `*` are never glob-expanded against the working directory.
# Arguments: $1 - raw line
# Outputs:   normalized line on stdout
#######################################
ltc_normalize_line() {
    local IFS=$' \t\n'
    local text=${1//$'\xC2\xA0'/ }
    local -a words=()
    # read -a splits on IFS without performing pathname expansion.
    read -r -a words <<< "$text"
    printf '%s\n' "${words[*]}"
}

#######################################
# Map a month word to its two-digit number.
# Arguments: $1 - month word, full or abbreviated (e.g. "January", "Sept")
# Outputs:   "01".."12", or "--" when the word is not recognised
#######################################
ltc_month_number() {
    case "$1" in
        Jan*) printf '%s\n' "01" ;;
        Feb*) printf '%s\n' "02" ;;
        Mar*) printf '%s\n' "03" ;;
        Apr*) printf '%s\n' "04" ;;
        May) printf '%s\n' "05" ;;
        Jun*) printf '%s\n' "06" ;;
        Jul*) printf '%s\n' "07" ;;
        Aug*) printf '%s\n' "08" ;;
        Sep*) printf '%s\n' "09" ;;
        Oct*) printf '%s\n' "10" ;;
        Nov*) printf '%s\n' "11" ;;
        Dec*) printf '%s\n' "12" ;;
        *) printf '%s\n' "--" ;;
    esac
}

#######################################
# Decide whether a meeting falls inside the download window: its month must be
# no earlier than one month before the current month. The comparison is done on
# year*12+month so it stays correct across a year boundary (comparing year and
# month separately dropped December meetings in January and any next-year
# meeting with a smaller month number).
# Arguments: $1 meeting year, $2 meeting month, $3 current year, $4 current month
# Returns:   0 when inside the window; 1 otherwise, including when any value is
#            not purely digits (e.g. the "--" placeholder for an unknown month)
#######################################
ltc_is_recent_meeting() {
    local year=$1 month=$2 now_year=$3 now_month=$4
    local value
    for value in "$year" "$month" "$now_year" "$now_month"; do
        [[ "$value" =~ ^[0-9]+$ ]] || return 1
    done
    # 10# forces base 10 so "08" and "09" are not rejected as octal.
    (( 10#$year * 12 + 10#$month >= 10#$now_year * 12 + 10#$now_month - 1 ))
}

#######################################
# Extract the link target from a line containing <a href="...">.
# Arguments: $1 - normalized HTML line
# Outputs:   the href value, cut at its closing double quote
#######################################
ltc_extract_href() {
    printf '%s\n' "$1" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//'
}

#######################################
# Extract a PDF link target from a line containing <a href="...pdf...">.
# Arguments: $1 - normalized HTML line
# Outputs:   the href value, cut right after its first literal ".pdf"
#######################################
ltc_extract_pdf_href() {
    # The dot is escaped so a path segment such as "/pdfs/" is not mistaken
    # for the ".pdf" extension.
    printf '%s\n' "$1" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/\.pdf.*/.pdf/'
}

# "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
while IFS= read -r LINE_PRE || [[ -n "$LINE_PRE" ]]; do
    LINE=$(ltc_normalize_line "$LINE_PRE")

    # Only <h2> without a class is the title of a committee. We confirm it is
    # the title by checking for "Agendas and Minutes".
    if [[ "$LINE" == *"<h2>"* && "$LINE" == *"Agendas and Minutes"* ]]; then
        COMMITTEENAME=$(printf '%s\n' "$LINE" | sed 's/<h2>//' | sed 's/<\/h2>//')
        echo "NEW COMMITTEE"
        echo "$COMMITTEENAME"
        # NOTE(review): a heading that matches neither pattern keeps the
        # previous slug, so its files land in the previous committee's folder.
        # Left as is because the intended handling is not visible here.
        if [[ "$COMMITTEENAME" == *"APTSAC"* ]]; then
            echo "Committee slug set"
            COMMITTEENAME_SLUG="Accessible Public Transit Services Advisory Committee"
        elif [[ "$COMMITTEENAME" == *"Commission"* ]]; then
            echo "Committee slug set"
            COMMITTEENAME_SLUG="Commission"
        fi
        # Only a marker for a new committee. Do nothing else.

    elif [[ "$LINE" == *"</strong></td>"* ]]; then
        # Remove HTML junk from date string.
        DATES_CLEAN=$(printf '%s\n' "$LINE" | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
        MEETING_MONTH_WORD=$(printf '%s\n' "$DATES_CLEAN" | sed -E 's/^([A-Za-z]+) .*/\1/')
        MEETING_DAY_SHORT=$(printf '%s\n' "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
        MEETING_YEAR=$(printf '%s\n' "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
        # Zero-pad the day; fall back to "00" when the day did not parse.
        if [[ "$MEETING_DAY_SHORT" =~ ^[0-9]+$ ]]; then
            printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
        else
            MEETING_DAY="00"
        fi
        MEETING_MONTH=$(ltc_month_number "$MEETING_MONTH_WORD")
        echo " NEW MEETING FOUND"
        echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"

    # Has a committee been set? What about a meeting date?
    # Remove the comparison to current dates in order to download the full
    # page. It was added for automated LA scripts.
    elif [[ -n "$COMMITTEENAME" ]] && ltc_is_recent_meeting "$MEETING_YEAR" "$MEETING_MONTH" "$current_year" "$current_month"; then
        # Not changing meetings, and we know that a meeting has already been
        # set. If the line carries a link --> make folder --> download.
        AGENDAURL=""
        AGENDAHTMLURL=""
        MINUTESURL=""
        MINUTESHTMLURL=""
        if [[ "$LINE" == *"PDF Agenda"* && "$LINE" == *".pdf"* ]]; then
            AGENDAURL=$(ltc_extract_pdf_href "$LINE")
        fi
        if [[ "$LINE" == *"HTML Agenda"* ]]; then
            AGENDAHTMLURL=$(ltc_extract_href "$LINE")
        fi
        if [[ "$LINE" == *"PDF Minutes"* && "$LINE" == *".pdf"* ]]; then
            MINUTESURL=$(ltc_extract_pdf_href "$LINE")
        fi
        if [[ "$LINE" == *"HTML Minutes"* ]]; then
            MINUTESHTMLURL=$(ltc_extract_href "$LINE")
        fi

        if [[ -n "$AGENDAURL" || -n "$MINUTESURL" || -n "$AGENDAHTMLURL" || -n "$MINUTESHTMLURL" ]]; then
            MEETING_DIR="./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
            mkdir -p "$MEETING_DIR"
            if [[ -n "$AGENDAURL" ]]; then
                echo " DOWNLOAD AGENDA PDF"
                echo " $AGENDAURL"
                wget --user-agent="$WGET_UA" "$AGENDAURL" -O "$MEETING_DIR/Agenda.pdf" -c -q #--show-progress
            elif [[ -n "$MINUTESURL" ]]; then
                echo " DOWNLOAD MINUTES PDF"
                echo " $MINUTESURL"
                wget --user-agent="$WGET_UA" "$MINUTESURL" -O "$MEETING_DIR/Minutes.pdf" -c -q #--show-progress
            else
                # HTML agenda or minutes: fetch the page, then crawl the links
                # inside its <article> element for attachments.
                if [[ -n "$AGENDAHTMLURL" ]]; then
                    echo " DOWNLOAD AGENDA HTML TO CRAWL"
                    echo " $AGENDAHTMLURL"
                    wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress
                else
                    echo " DOWNLOAD MINUTES HTML TO CRAWL"
                    echo " $MINUTESHTMLURL"
                    wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress
                fi
                # Start each document outside the article so a page without a
                # closing </article> cannot leak state into the next crawl.
                ISARTICLE=""
                while IFS= read -r LINE_HTML_PRE || [[ -n "$LINE_HTML_PRE" ]]; do
                    LINE_HTML=$(ltc_normalize_line "$LINE_HTML_PRE")
                    GREPLINK=""
                    if [[ "$LINE_HTML" == *"<a href"* ]]; then
                        GREPLINK=$(ltc_extract_href "$LINE_HTML")
                    fi
                    if [[ "$LINE_HTML" == *"<article"* ]]; then
                        echo " FOUND INDEX ARTICLE START"
                        ISARTICLE="TRUE"
                    elif [[ "$LINE_HTML" == *"</article>"* ]]; then
                        echo " END OF INDEX ARTICLE"
                        ISARTICLE=""
                    elif [[ -n "$GREPLINK" && -n "$ISARTICLE" ]]; then
                        mkdir -p "$MEETING_DIR/Attachments"
                        if [[ "$GREPLINK" == *".pdf"* ]]; then
                            # Direct PDF attachment: save it under its own file name.
                            PDFNAME=${GREPLINK##*/}
                            echo " DOWNLOAD ATTACHMENT PDF"
                            echo " $GREPLINK"
                            wget --user-agent="$WGET_UA" "$GREPLINK" -O "$MEETING_DIR/Attachments/$PDFNAME" -c -q #--show-progress
                        else
                            # Extract title of attachment: drop <sup> tags, take
                            # the link text, spell out "&amp;", blank other
                            # entities, strip dash characters, then squeeze the
                            # doubled spaces those removals leave behind.
                            ATTACHTITLE=$(printf '%s\n' "$LINE_HTML" \
                                | sed 's/<sup>//g' \
                                | sed 's/<\/sup>//g' \
                                | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' \
                                | sed 's/&amp;/and/g' \
                                | sed 's/&.....;./ /g' \
                                | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' \
                                | tr -s ' ')
                            # The title becomes a file name; a "/" would point
                            # into a directory that does not exist.
                            ATTACHTITLE=${ATTACHTITLE//\//-}
                            echo " DOWNLOAD ATTACHMENT HTML"
                            echo " $ATTACHTITLE"
                            echo " $GREPLINK"
                            wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress
                            ISATTACHMENTARTICLE=""
                            while IFS= read -r LINE_ATTACH_PRE || [[ -n "$LINE_ATTACH_PRE" ]]; do
                                LINE_ATTACH=$(ltc_normalize_line "$LINE_ATTACH_PRE")
                                GREPATTACHMENTLINK=""
                                # Require an actual anchor so prose that merely
                                # mentions ".pdf" is copied, not fetched.
                                if [[ "$LINE_ATTACH" == *".pdf"* && "$LINE_ATTACH" == *"<a href"* ]]; then
                                    GREPATTACHMENTLINK=$(ltc_extract_pdf_href "$LINE_ATTACH")
                                fi
                                if [[ "$LINE_ATTACH" == *"<article"* ]]; then
                                    echo " FOUND ATTACHMENT ARTICLE START"
                                    # CSS for the HTML is in the default template
                                    cat ./template/default.html > ./tmp/new.html
                                    printf '%s\n' "$LINE_ATTACH" >> ./tmp/new.html
                                    ISATTACHMENTARTICLE="TRUE"
                                elif [[ "$LINE_ATTACH" == *"</article>"* ]]; then
                                    echo " END OF ATTACHMENT ARTICLE"
                                    printf '%s\n' "$LINE_ATTACH" >> ./tmp/new.html
                                    echo " PROCESSED TO PDF"
                                    wkhtmltopdf ./tmp/new.html "$MEETING_DIR/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
                                    ISATTACHMENTARTICLE=""
                                elif [[ -n "$GREPATTACHMENTLINK" && -n "$ISATTACHMENTARTICLE" ]]; then
                                    # PDF referenced from inside the attachment
                                    # page: download it and list its name in the
                                    # rendered copy instead of the link line.
                                    PDFREFNAME=${GREPATTACHMENTLINK##*/}
                                    echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
                                    echo " $GREPATTACHMENTLINK"
                                    wget --user-agent="$WGET_UA" "$GREPATTACHMENTLINK" -O "$MEETING_DIR/Attachments/$PDFREFNAME" -c -q #--show-progress
                                    echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
                                elif [[ -n "$ISATTACHMENTARTICLE" ]]; then
                                    printf '%s\n' "$LINE_ATTACH" >> ./tmp/new.html
                                fi
                            done < ./tmp/attachment.html
                        fi
                    fi
                done < ./tmp/work.html
            fi
        fi
    fi
done < "./tmp/index.html"