185 lines
9.6 KiB
Bash
185 lines
9.6 KiB
Bash
#!/usr/bin/env bash
#
# SCRAPE_LTC.SH: downloads London Transit Commission (LTC) committee agendas
# and minutes into ./LondonArchive/LTC.
#
# This first section prints the banner, defines the shared settings, creates
# the working folders, downloads the index page that the rest of the script
# parses, and records today's date for the "is this meeting recent?" check.

printf '\n%s\n' "-========================================================================-"
printf '%s\n' "-=- -=-"
printf '%s\n' "-=- SCRAPE_LTC.SH: Downloads LTC committee agendas and minutes -=-"
printf '%s\n' "-=- -=-"
printf '%s\n' "-=- Lillian Skinner -=-"
printf '%s\n' "-=- -=-"
printf '%s\n' "-========================================================================-"

# NOTE(review): MEETINGS_PAGE is not referenced anywhere in the visible script;
# it is kept in case something that sources this file relies on it.
MEETINGS_PAGE="./tmp.html"

# London seems to have recently blocked unusual user agents, so plain wget (or
# even ping) no longer works. Thankfully we can pretend to be a real browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# -p: create parents as needed and stay quiet when the folders already exist,
# so re-running the script does not print "File exists" errors.
mkdir -p "./LondonArchive/LTC" "./tmp"

# Fetch the index page. Everything below depends on it, so stop with a clear
# message instead of silently parsing an empty file.
if ! wget --user-agent="$WGET_UA" "https://www.londontransit.ca/agendas-and-minutes/" -O "./tmp/index.html" -q || [[ ! -s "./tmp/index.html" ]]; then #--show-progress
  echo "ERROR: could not download the LTC agendas-and-minutes index page." >&2
  exit 1
fi

current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# These must hold numeric placeholders before the first meeting date is parsed;
# otherwise the base-10 arithmetic used later fails with
# "10#: invalid integer constant".
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
# =============================================================================
# Index crawl.
#
# Walks ./tmp/index.html line by line. Three kinds of lines matter:
#   * "<h2>... Agendas and Minutes</h2>"  -> a new committee section starts
#   * a line containing "</strong></td>"  -> the date cell of a new meeting
#   * anything else                       -> may carry an agenda/minutes link
#                                            for the meeting seen last
# PDF links are saved directly. HTML agendas/minutes are crawled: every link
# inside their <article> is either a PDF (saved as-is) or another HTML page
# whose <article> is rendered to PDF with wkhtmltopdf.
#
# Globals read:    WGET_UA, current_year, current_month
# Globals written: COMMITTEENAME, COMMITTEENAME_SLUG, MEETING_YEAR,
#                  MEETING_MONTH, MEETING_DAY
# =============================================================================

# Root folder under which committee/year/month-day folders are created.
ARCHIVE_ROOT="./LondonArchive/LTC"

# Normally set earlier in the script; default them so this section also works
# when they are missing.
: "${current_year:=$(date +%Y)}"
: "${current_month:=$(date +%m)}"

# Replace every UTF-8 non-breaking space with a plain space, then trim the line
# and collapse runs of whitespace. This is what an unquoted `echo $line` does,
# but without glob-expanding characters such as * or ? against the filesystem.
normalize_line() {
  local text=${1//$'\xC2\xA0'/ }
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$text"
  printf '%s\n' "${words[*]}"
}

# Map an English month name or abbreviation to its two-digit number.
# Prints "--" when the word is not a recognised month.
month_to_number() {
  case "$1" in
    Jan*) printf '01\n' ;;
    Feb*) printf '02\n' ;;
    Mar*) printf '03\n' ;;
    Apr*) printf '04\n' ;;
    May) printf '05\n' ;;
    Jun*) printf '06\n' ;;
    Jul*) printf '07\n' ;;
    Aug*) printf '08\n' ;;
    Sep*) printf '09\n' ;;
    Oct*) printf '10\n' ;;
    Nov*) printf '11\n' ;;
    Dec*) printf '12\n' ;;
    *) printf -- '--\n' ;;
  esac
}

# Strip the HTML around a date cell: keep the text after the last <strong>,
# drop </strong>/<span> and everything after them, trim trailing whitespace
# and remove the first "." (so "Jan. 8, 2025" becomes "Jan 8, 2025").
clean_date_cell() {
  printf '%s\n' "$1" \
    | sed -e 's/.*<strong>//' \
          -e 's/<\/strong>.*//' \
          -e 's/<span.*//' \
          -e 's/[[:space:]]*$//' \
          -e 's/\.//'
}

# Parse "Month D, YYYY" into MEETING_YEAR / MEETING_MONTH / MEETING_DAY.
# On any parse failure the globals fall back to values that can never pass
# is_recent_meeting, and the function returns 1.
parse_meeting_date() {
  local date_re='^([A-Za-z]+) ([0-9]+), ([0-9]+)'
  local month_word day year
  MEETING_YEAR="0000"
  MEETING_MONTH="00"
  MEETING_DAY="00"
  [[ $1 =~ $date_re ]] || return 1
  month_word=${BASH_REMATCH[1]}
  day=${BASH_REMATCH[2]}
  year=${BASH_REMATCH[3]}
  MEETING_YEAR=$year
  # 10# forces base ten so that "08" and "09" are not read as invalid octal.
  printf -v MEETING_DAY '%02d' "$((10#$day))"
  MEETING_MONTH=$(month_to_number "$month_word")
  [[ $MEETING_MONTH != "--" ]]
}

# Succeed when meeting (year $1, month $2) is no older than the month before
# the current one (year $3, month $4). Months are compared on a single
# year*12+month scale so the December/January boundary works in both
# directions. Non-numeric input is rejected instead of raising arithmetic errors.
# To download the full page regardless of date, drop this check from the loop.
is_recent_meeting() {
  local value
  for value in "$1" "$2" "$3" "$4"; do
    [[ $value =~ ^[0-9]+$ ]] || return 1
  done
  (( 10#$1 * 12 + 10#$2 >= 10#$3 * 12 + 10#$4 - 1 ))
}

# Print the href of the anchor on line $1, cut off right after ".pdf", but only
# when the line contains both the label $2 and ".pdf". Prints nothing otherwise.
pdf_link() {
  [[ $1 == *"$2"* && $1 == *.pdf* ]] || return 0
  printf '%s\n' "$1" \
    | sed -e 's/.*<a href=.\([^<]*\)">.*/\1/' -e 's/\.pdf.*/.pdf/'
}

# Print the href of the anchor on line $1 (up to the closing quote), but only
# when the line contains the label $2. Prints nothing otherwise.
html_link() {
  [[ $1 == *"$2"* ]] || return 0
  printf '%s\n' "$1" \
    | sed -e 's/.*<a href=.\([^<]*\)">.*/\1/' -e 's/".*//'
}

# Turn the anchor text on line $1 into a string usable as a file name:
# drop <sup> tags, spell "&amp;" as "and", blank other HTML entities, remove
# Unicode dashes, squeeze spaces and replace "/" so it cannot create a path.
# NOTE(review): the "&amp;" and space-squeezing steps restore what looks like
# the original intent; the pasted source had "&" and a no-op "s/ / /g" there.
attachment_title() {
  printf '%s\n' "$1" \
    | sed -e 's/<sup>//g' -e 's/<\/sup>//g' \
    | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' \
    | sed -e 's/&amp;/and/g' -e 's/&.....;./ /g' \
    | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' \
    | sed -e 's/  */ /g' -e 's/^ //' -e 's/ $//' -e 's|/|-|g'
}

# Download URL $1 to file $2 with the browser user agent. Extra arguments
# (for example -c to resume) are passed through to wget.
# Returns non-zero and warns on stderr when the download fails.
fetch() {
  local url=$1 dest=$2
  shift 2
  if ! wget --user-agent="$WGET_UA" "$url" -O "$dest" -q "$@"; then #--show-progress
    echo " WARNING: download failed: $url" >&2
    return 1
  fi
}

# Process ./tmp/attachment.html: copy its <article> into ./tmp/new.html (on
# top of the CSS template), download any PDF it references into folder $1 and
# finally render the article to "$1/$2.pdf".
render_attachment() {
  local out_dir=$1 title=$2
  local raw line ref ref_name
  local in_article=0   # reset per document so state never leaks between pages

  while IFS= read -r raw || [[ -n $raw ]]; do
    line=$(normalize_line "$raw")
    if [[ $line == *"<article"* ]]; then
      echo " FOUND ATTACHMENT ARTICLE START"
      # CSS for the HTML is in the default template
      cat ./template/default.html > ./tmp/new.html
      printf '%s\n' "$line" >> ./tmp/new.html
      in_article=1
    elif [[ $line == *"</article>"* ]]; then
      # Only render when an article was actually opened; otherwise new.html
      # would still hold the previous attachment.
      if (( in_article )); then
        echo " END OF ATTACHMENT ARTICLE"
        printf '%s\n' "$line" >> ./tmp/new.html
        echo " PROCESSED TO PDF"
        # stdin is redirected so the tool cannot consume the page being read.
        wkhtmltopdf ./tmp/new.html "$out_dir/${title}.pdf" 2> /dev/null < /dev/null
      fi
      in_article=0
    elif (( in_article )) && [[ $line == *.pdf* ]]; then
      ref=$(pdf_link "$line" ".pdf")
      if [[ -n $ref ]]; then
        ref_name=${ref##*/}
        echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
        echo " $ref"
        fetch "$ref" "$out_dir/$ref_name" -c
        # The rendered PDF lists the file name in place of the original link.
        printf '%s\n' "<ul><li>$ref_name</li></ul>" >> ./tmp/new.html
      fi
    elif (( in_article )); then
      printf '%s\n' "$line" >> ./tmp/new.html
    fi
  done < ./tmp/attachment.html
}

# Process ./tmp/work.html (an HTML agenda or minutes page): for every link
# inside its <article>, save PDFs directly into "$1/Attachments" and render
# linked HTML pages to PDF there.
crawl_meeting_page() {
  local meeting_dir=$1
  local raw line link title fallback
  local in_article=0   # reset per document so state never leaks between pages

  while IFS= read -r raw || [[ -n $raw ]]; do
    line=$(normalize_line "$raw")
    if [[ $line == *"<article"* ]]; then
      echo " FOUND INDEX ARTICLE START"
      in_article=1
    elif [[ $line == *"</article>"* ]]; then
      echo " END OF INDEX ARTICLE"
      in_article=0
    elif (( in_article )) && [[ $line == *"<a href"* ]]; then
      link=$(html_link "$line" "<a href")
      [[ -n $link ]] || continue
      mkdir -p "$meeting_dir/Attachments"
      if [[ $link == *.pdf* ]]; then
        echo " DOWNLOAD ATTACHMENT PDF"
        echo " $link"
        fetch "$link" "$meeting_dir/Attachments/${link##*/}" -c
      else
        # Extract title of attachment; fall back to the last URL segment so an
        # untitled link never produces a file called ".pdf".
        title=$(attachment_title "$line")
        if [[ -z $title ]]; then
          fallback=${link%/}
          title=${fallback##*/}
        fi
        echo " DOWNLOAD ATTACHMENT HTML"
        echo " $title"
        echo " $link"
        if fetch "$link" "./tmp/attachment.html"; then
          render_attachment "$meeting_dir/Attachments" "$title"
        fi
      fi
    fi
  done < ./tmp/work.html
}

COMMITTEENAME=""
COMMITTEENAME_SLUG=""
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"

if [[ -r "./tmp/index.html" ]]; then
  while IFS= read -r LINE_PRE || [[ -n $LINE_PRE ]]; do
    LINE=$(normalize_line "$LINE_PRE")

    if [[ $LINE == *"<h2>"* && $LINE == *"Agendas and Minutes"* ]]; then
      # Only an <h2> without a class is the title of a committee; the
      # "Agendas and Minutes" text confirms it. This line is only a marker
      # for a new committee, nothing is downloaded for it.
      COMMITTEENAME=$(printf '%s\n' "$LINE" | sed -e 's/<h2>//' -e 's/<\/h2>//')
      echo "NEW COMMITTEE"
      echo "$COMMITTEENAME"
      case "$COMMITTEENAME" in
        *APTSAC*)
          echo "Committee slug set"
          COMMITTEENAME_SLUG="Accessible Public Transit Services Advisory Committee"
          ;;
        *Commission*)
          echo "Committee slug set"
          COMMITTEENAME_SLUG="Commission"
          ;;
        *)
          # Unknown committee: file it under its own name rather than under
          # whichever committee happened to come before it.
          COMMITTEENAME_SLUG=${COMMITTEENAME//\//-}
          echo " WARNING: unrecognised committee, using folder '$COMMITTEENAME_SLUG'" >&2
          ;;
      esac
      # Links that appear before the first date row of this committee must
      # not be filed under the previous committee's last meeting.
      MEETING_YEAR="0000"
      MEETING_MONTH="00"
      MEETING_DAY="00"

    elif [[ $LINE == *"</strong></td>"* ]]; then
      # Remove HTML junk from the date string, then split it into parts.
      DATES_CLEAN=$(clean_date_cell "$LINE")
      if ! parse_meeting_date "$DATES_CLEAN"; then
        echo " WARNING: could not parse meeting date from '$DATES_CLEAN'" >&2
      fi
      echo " NEW MEETING FOUND"
      echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"

    elif [[ -n $COMMITTEENAME && -n $COMMITTEENAME_SLUG ]] \
      && is_recent_meeting "$MEETING_YEAR" "$MEETING_MONTH" "$current_year" "$current_month"; then
      # A committee and a recent meeting date are known: look for a link.
      AGENDAURL=$(pdf_link "$LINE" "PDF Agenda")
      AGENDAHTMLURL=$(html_link "$LINE" "HTML Agenda")
      MINUTESURL=$(pdf_link "$LINE" "PDF Minutes")
      MINUTESHTMLURL=$(html_link "$LINE" "HTML Minutes")

      if [[ -n $AGENDAURL || -n $MINUTESURL || -n $AGENDAHTMLURL || -n $MINUTESHTMLURL ]]; then
        MEETING_DIR="$ARCHIVE_ROOT/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
        mkdir -p "$MEETING_DIR"

        # One link per line is handled, in this order of preference.
        if [[ -n $AGENDAURL ]]; then
          echo " DOWNLOAD AGENDA PDF"
          echo " $AGENDAURL"
          fetch "$AGENDAURL" "$MEETING_DIR/Agenda.pdf" -c
        elif [[ -n $MINUTESURL ]]; then
          echo " DOWNLOAD MINUTES PDF"
          echo " $MINUTESURL"
          fetch "$MINUTESURL" "$MEETING_DIR/Minutes.pdf" -c
        else
          if [[ -n $AGENDAHTMLURL ]]; then
            echo " DOWNLOAD AGENDA HTML TO CRAWL"
            PAGE_URL=$AGENDAHTMLURL
          else
            echo " DOWNLOAD MINUTES HTML TO CRAWL"
            PAGE_URL=$MINUTESHTMLURL
          fi
          echo " $PAGE_URL"
          # Crawl only a page that was actually downloaded.
          if fetch "$PAGE_URL" "./tmp/work.html"; then
            crawl_meeting_page "$MEETING_DIR"
          fi
        fi
      fi
    fi
  done < "./tmp/index.html"
else
  echo "WARNING: ./tmp/index.html is missing or unreadable; nothing to scrape." >&2
fi