#!/usr/bin/env bash

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

#######################################
# Translate a capitalised month name (full or abbreviated) to two digits.
# Globals:   MEETING_MONTH (written; "--" when the name is not recognised)
# Arguments: $1 - month word, e.g. "June" or "Jun"
#######################################
_set_meeting_month() {
  case "$1" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}

#######################################
# Zero-pad a day-of-month to two digits.
# Globals:   MEETING_DAY (written; "00" when $1 is not a plain number)
# Arguments: $1 - day as digits, with or without a leading zero
#######################################
_set_meeting_day() {
  if [[ $1 =~ ^[0-9]+$ ]]; then
    # Force base 10: a bare "08"/"09" would otherwise be rejected as octal.
    printf -v MEETING_DAY '%02d' "$((10#$1))"
  else
    MEETING_DAY="00"
  fi
}

#######################################
# Apply one sed -E substitution to a string and trim surrounding blanks.
# Arguments: $1 - input text, $2 - sed substitution expression
# Outputs:   the transformed text on stdout
#######################################
_extract_date_field() {
  printf '%s\n' "$1" \
    | sed -E -e "$2" -e 's/^[[:blank:]]*//' -e 's/[[:blank:]]*$//'
}

#######################################
# Parse a date written as "Month D, YYYY" (e.g. "June 8, 2023").
# Globals:   MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#            MEETING_YEAR, MEETING_MONTH (all written)
# Arguments: $1 - date string
# Outputs:   echoes the input date to stdout
#######################################
conv_date() {
  printf '%s\n' "$1"
  MEETING_MONTH_WORD=$(_extract_date_field "$1" 's/^([A-Za-z]+) .*/\1/')
  MEETING_DAY_SHORT=$(_extract_date_field "$1" 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
  _set_meeting_day "$MEETING_DAY_SHORT"
  MEETING_YEAR=$(_extract_date_field "$1" 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
  _set_meeting_month "$MEETING_MONTH_WORD"
}

#######################################
# Parse a date written as "D Month YYYY" (e.g. "8 June 2023").
# Globals:   MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#            MEETING_YEAR, MEETING_MONTH (all written)
# Arguments: $1 - date string
# Outputs:   echoes the input date to stdout
#######################################
conv_date_alt() {
  printf '%s\n' "$1"
  # Second space-separated word: drop the first word, then everything after.
  MEETING_MONTH_WORD=$(printf '%s\n' "$1" | sed -e 's/^[^ ]* //' -e 's/ .*//')
  MEETING_DAY_SHORT=$(printf '%s\n' "$1" | sed 's/ .*//')
  _set_meeting_day "$MEETING_DAY_SHORT"
  MEETING_YEAR=$(printf '%s\n' "$1" | sed 's/.* //')
  _set_meeting_month "$MEETING_MONTH_WORD"
}

#######################################
# Store a document URL in the global that matches its link label.
# Unknown labels are ignored.
# Globals:   one of the *_URL variables listed in clear_agenda_url (written)
# Arguments: $1 - link label including its double quotes,
#                 e.g. '"Agenda (PDF)"'
#            $2 - URL, possibly wrapped in double quotes
#######################################
set_agenda_url() {
  local target
  case "$1" in
    '"Agenda (HTML)"') target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"') target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"') target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"') target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"') target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"') target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"') target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"') target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"') target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"') target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"') target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"') target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"') target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"') target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  # Strip every double quote without word-splitting or globbing the URL
  # (URLs routinely contain '?' and '*').
  local url=${2//\"/}
  # Trim surrounding whitespace, as the former unquoted echo did implicitly.
  url="${url#"${url%%[![:space:]]*}"}"
  url="${url%"${url##*[![:space:]]}"}"
  printf -v "$target" '%s' "$url"
}

#######################################
# Reset every document URL global to the empty string.
# Globals:   all *_URL variables named below (written)
#######################################
clear_agenda_url() {
  local name
  for name in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$name" '%s' ''
  done
}

#######################################
# Download one document if its URL is set; do nothing otherwise.
# Globals:   WGET_UA (read)
# Arguments: $1 - URL (may be empty), $2 - destination file,
#            $3 - progress message printed before the download
# Returns:   0 when the URL is empty, otherwise wget's exit status
#######################################
_save_document() {
  local url=$1 dest=$2 message=$3
  [[ -n $url ]] || return 0
  echo "$message"
  # NOTE(review): the wget manual describes -N as ineffective together with
  # -O; the flags are kept exactly as the script has always passed them.
  wget --no-check-certificate --user-agent="$WGET_UA" "$url" -O "$dest" -N -q #--show-progress
}

#######################################
# Download the agenda, minutes, cover page and addendum of one meeting.
#
# Agenda: only the first available group is fetched, in this order:
#   PDF (revised/regular), HTML (revised/regular), full package, post agenda.
# Minutes: PDF variants when any exists, otherwise the HTML version.
# Cover page and addendum: every variant that has a URL.
#
# Globals:   the *_URL variables filled in by set_agenda_url (read),
#            WGET_UA (read)
# Arguments: $1 - directory the files are written into
#######################################
download_agendas() {
  local dir=$1

  if [[ -n $AGENDA_REVISE_PDF_URL || -n $AGENDA_PDF_URL ]]; then
    _save_document "$AGENDA_REVISE_PDF_URL" "$dir/Agenda_Revised.pdf" \
      "Saving revised agenda as PDF..."
    _save_document "$AGENDA_PDF_URL" "$dir/Agenda.pdf" \
      "Saving regular agenda as PDF..."
  elif [[ -n $AGENDA_REVISE_HTML_URL || -n $AGENDA_HTML_URL ]]; then
    _save_document "$AGENDA_REVISE_HTML_URL" "$dir/Agenda_Revised.html" \
      "Saving revised agenda as HTML... (no PDF found!)"
    _save_document "$AGENDA_HTML_URL" "$dir/Agenda.html" \
      "Saving regular agenda as HTML... (no PDF found!)"
  elif [[ -n $AGENDA_FULL_PDF_URL || -n $AGENDA_FULL_HTML_URL ]]; then
    _save_document "$AGENDA_FULL_PDF_URL" "$dir/Agenda_FullPackage.pdf" \
      "Saving full package agenda as PDF... (no HTML found!)"
    _save_document "$AGENDA_FULL_HTML_URL" "$dir/Agenda_FullPackage.html" \
      "Saving full package agenda as HTML... (no PDF found!)"
  elif [[ -n $AGENDA_POST_PDF_URL || -n $AGENDA_POST_HTML_URL ]]; then
    # The PDF branch used to announce itself as "HTML"; message corrected.
    _save_document "$AGENDA_POST_PDF_URL" "$dir/Agenda_Post.pdf" \
      "Saving post agenda as PDF... (no HTML found!)"
    _save_document "$AGENDA_POST_HTML_URL" "$dir/Agenda_Post.html" \
      "Saving post agenda as HTML... (no PDF found!)"
  fi

  if [[ -n $MINUTES_ATTACH_PDF_URL || -n $MINUTES_PDF_URL ]]; then
    _save_document "$MINUTES_ATTACH_PDF_URL" "$dir/Minutes_With_Attachments.pdf" \
      "Saving minutes with attachments as PDF..."
    _save_document "$MINUTES_PDF_URL" "$dir/Minutes.pdf" \
      "Saving minutes as PDF..."
  else
    _save_document "$MINUTES_HTML_URL" "$dir/Minutes.html" \
      "Saving minutes as HTML... (no PDF found!)"
  fi

  _save_document "$AGENDA_COVER_PDF_URL" "$dir/Agenda_Cover.pdf" \
    "Saving cover agenda as PDF... (no HTML found!)"
  _save_document "$AGENDA_COVER_HTML_URL" "$dir/Agenda_Cover.html" \
    "Saving cover agenda as HTML... (no PDF found!)"
  _save_document "$ADDENDUM_PDF_URL" "$dir/Addendum.pdf" \
    "Saving addendum as PDF... (no HTML found!)"
  _save_document "$ADDENDUM_HTML_URL" "$dir/Addendum.html" \
    "Saving addendum as HTML... (no PDF found!)"
}

# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.

# London seems to have recently blocked unusual user agents, so a default wget
# (or even ping) cannot be used. Thankfully we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87" TEMP_DIR="./tmp/" INDEX_PAGE="./tmp/index.html" SEARCH_PAGE="./tmp/search.html" AGENDA_HTML="./tmp/work.html" ADDENDUM_HTML="./tmp/addendum.html" #VIDEO_TIMESTAMP_JSON="./tmp/time.json" current_year=$(date +%Y) current_month=$(date +%m) current_day=$(date +%d)00 SUPPORT_PAST=1 if [ -d "$TEMP_DIR" ]; then rm -r $TEMP_DIR fi rm -f $INDEX_PAGE rm -f $SEARCH_PAGE rm -f $AGENDA_HTML mkdir $TEMP_DIR while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g') CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') INDEX_END=0 while (( ! INDEX_END )); do echo "SCRAPE_ESCRIBE: Downloading eScribe index..." wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress if [ $? -ne 8 ]; then FOUNDLIST=0 while IFS= read -r LINE; do if (( FOUNDLIST )); then GREPENDLIST=$(echo $LINE | grep '