gist-1b471e5f5215c368fd78d9.../SCRAPE_LONDON.SH

#!/usr/bin/env bash
# NOTE: a shebang only takes effect when it is the very first line of the file.
# The script uses bash-only syntax ([[ ]] and (( ))), so it must run under bash.

# Print the startup banner: one blank line followed by the boxed title.
# The quoted delimiter ('EOF') keeps the text literal (no expansion).
cat <<'EOF'

-========================================================================-
-=-                                                                    -=-
-=-      SCRAPE_LONDON.SH: Downloads committee videos and agendas      -=-
-=-                                                                    -=-
-=-    https://gist.github.com/rvtr/1b471e5f5215c368fd78d9aba05f8dc2   -=-
-=-     Lillian Skinner                                                -=-
-=-                                                                    -=-
-========================================================================-
EOF

# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.

# London seems to have recently blocked unusual user agents: plain wget (and even ping) no longer gets through. Thankfully we can pretend to be a real person by sending a browser-like User-Agent!
# User-Agent string handed to every wget call via --user-agent.
WGET_UA='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87'

# Working locations, all relative to the directory the script is run from.
# Each directory path ends with a slash so file names can be appended directly.
TEMP_DIR='./tmp/'
AGENDA_DIR='./Agenda/'
VIDEO_DIR='./Video/'

# Scratch files inside TEMP_DIR: the current search-results page and the
# agenda HTML of the meeting currently being processed.
SEARCH_PAGE="${TEMP_DIR}index.html"
AGENDA_HTML="${TEMP_DIR}work.html"
#VIDEO_TIMESTAMP_JSON="./tmp/time.json"

# Start every run from a clean slate: delete whatever a previous run left in
# the temp, agenda and video directories, then recreate each one empty.
# WARNING: this removes the existing contents of all three directories.
for WORK_DIR in "$TEMP_DIR" "$AGENDA_DIR" "$VIDEO_DIR"; do
  # Only an existing directory is removed; the -d test is also false for an
  # empty/unset path, so rm never runs without a real target.
  if [[ -d "$WORK_DIR" ]]; then
    rm -r -- "$WORK_DIR"
  fi
  # Nothing later in the script can work without these directories, so stop
  # immediately if one cannot be created.
  if ! mkdir -p -- "$WORK_DIR"; then
    echo "Could not create directory: $WORK_DIR" >&2
    exit 1
  fi
done

# Base address of the meeting search; the main loop appends "?page=<n>" to it.
SEARCH_URL='https://london.ca/government/council-civic-administration/council-committee-meetings/meetings'

# Query-string fragments for filtering and sorting the search.
# Need to confirm: when stacking params, does the date need to be f[1]?
SEARCH_PARAM_COMMITTEE='f[0]=meeting_type%3A'
SEARCH_PARAM_DATE='f[0]=meeting_date%3A'
SEARCH_PARAM_QUERY='search=query&sort_by=field_meeting_date'

# ---------------------------------------------------------------------------
# Main scrape loop
#
# Walks the paginated meeting search (?page=0, 1, 2, ...) until a page cannot
# be downloaded or contains "No results found.". For every meeting listed on a
# page it saves, under "${TEMP_DIR}<meeting name>/":
#   - RecordingLink.txt   (only when the agenda HTML contains an isi_player)
#   - Attachments/        (files linked from the agenda HTML)
#   - Agenda*.pdf and Minutes*.pdf, falling back to the HTML versions when no
#     PDF is listed
#
# Reads: WGET_UA, SEARCH_URL, SEARCH_PAGE, AGENDA_HTML, TEMP_DIR
# ---------------------------------------------------------------------------

# Download URL $1 into file $2 with the browser-like user agent.
# Returns wget's exit status. (Add --show-progress to wget when debugging.)
fetch_url() {
  wget --user-agent="$WGET_UA" "$1" -O "$2" -q
}

# Print $1 with leading/trailing whitespace removed and every run of
# whitespace collapsed to one space. This is what an unquoted `echo $var`
# does, but without also glob-expanding characters such as * and ?.
squeeze_whitespace() {
  local -a words=()
  read -r -a words <<<"$1"
  printf '%s\n' "${words[*]}"
}

i=0
SEARCH_END="FALSE"
while [[ "$SEARCH_END" == "FALSE" ]]; do
  echo "Downloading search results... Page: $i"

  if ! fetch_url "${SEARCH_URL}?page=$i" "$SEARCH_PAGE"; then
    # Stop on ANY wget failure. Treating only exit code 8 (server error
    # response) as the end meant a network/DNS failure left an empty page
    # behind and the loop kept requesting further pages forever.
    echo "Search page $i could not be downloaded; stopping." >&2
    SEARCH_END="TRUE"
  elif grep -q "No results found." "$SEARCH_PAGE"; then
    # Paged past the last page of results.
    SEARCH_END="TRUE"
  else
    FOUNDMEETING="FALSE"

    while IFS= read -r LINE; do

      # All meeting items in the search results are formatted like so:
      # - One line with the name
      # - Second line with all other info including links
      #
      # The first line is recognised by the class
      # "views-field-field-meeting-notes". FOUNDMEETING=TRUE records that the
      # first line was just seen, so the line being read now is line 2 of
      # that meeting.

      if [[ "$FOUNDMEETING" == "TRUE" ]]; then
        FOUNDMEETING="FALSE"
        echo "-========================================================================-"
        echo " Working on $MEETING_NAME"
        echo "-========================================================================-"

        MEETING_DIR="${TEMP_DIR}${MEETING_NAME}"
        LINE_FLAT=$(squeeze_whitespace "$LINE")

        # Put every link on its own line and extract the URL / the link text.
        # sed runs without -n, so each match is printed twice; uniq folds the
        # adjacent duplicates back into a single line.
        # Grab meeting item links
        printf '%s\n' "$LINE_FLAT" \
          | sed 's/href=./\nhref="/g' \
          | grep 'href="https' \
          | sed 's/.*href="\([^"]*\)".*/\1/p' \
          | uniq >"${TEMP_DIR}meeting_urls"
        # Grab meeting item types
        printf '%s\n' "$LINE_FLAT" \
          | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' \
          | grep 'rel="noreferrer">' \
          | sed 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' \
          | uniq >"${TEMP_DIR}meeting_types"

        AGENDA_HTML_URL=""
        AGENDA_PDF_URL=""
        AGENDA_REVISE_HTML_URL=""
        AGENDA_REVISE_PDF_URL=""
        MINUTES_HTML_URL=""
        MINUTES_PDF_URL=""
        MINUTES_ATTACH_PDF_URL=""

        # The two files are read in lock-step: line N of meeting_urls is the
        # URL for the document type on line N of meeting_types (read on fd 3).
        echo "Found the following documents:"
        while IFS= read -r DOC_URL && IFS= read -r DOC_TYPE <&3; do
          echo " - $DOC_TYPE"

          # The trailing space inside each pattern is intentional: the
          # extracted link text ends with one.
          case "$DOC_TYPE" in
            "Agenda (HTML) ")
              AGENDA_HTML_URL="$DOC_URL" ;;
            "Agenda (PDF) ")
              AGENDA_PDF_URL="$DOC_URL" ;;
            "Revised Agenda (HTML) ")
              AGENDA_REVISE_HTML_URL="$DOC_URL" ;;
            "Revised Agenda (PDF) ")
              AGENDA_REVISE_PDF_URL="$DOC_URL" ;;
            "Minutes (HTML) ")
              MINUTES_HTML_URL="$DOC_URL" ;;
            "Minutes (PDF) ")
              MINUTES_PDF_URL="$DOC_URL" ;;
            "Minutes with Attachments (PDF) ")
              MINUTES_ATTACH_PDF_URL="$DOC_URL" ;;
          esac
        done <"${TEMP_DIR}meeting_urls" 3<"${TEMP_DIR}meeting_types"

        # Reset per meeting so one meeting without an agenda does not affect
        # the ones after it.
        ERROR="FALSE"

        # Always prefer Revised Agendas
        echo "Downloading agenda HTML..."
        if [[ -n "$AGENDA_REVISE_HTML_URL" ]]; then
          fetch_url "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
        elif [[ -n "$AGENDA_HTML_URL" ]]; then
          fetch_url "$AGENDA_HTML_URL" "$AGENDA_HTML"
        else
          ERROR="TRUE"
          echo "No HTML agenda listed; skipping this meeting." >&2
        fi

        # The old test `[[ ERROR="FALSE" ]]` checked a non-empty literal and
        # was always true, so a meeting without an HTML agenda was processed
        # against the previous meeting's leftover agenda file.
        if [[ "$ERROR" == "FALSE" ]]; then
          mkdir -p "$MEETING_DIR/Attachments/"

          # Direct video links is always "video.isilive.ca/<REGION>/<NAME>"
          # There are some eScribe ones, but those are in m3u8s and are really annoying to work with

          # ...not annoying as more sed though.
          VIDEO_URL=$(grep 'id="isi_player"' "$AGENDA_HTML" \
            | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' \
            | sed 's/ /%20/g')

          if [[ -n "$VIDEO_URL" ]]; then
            echo "Found meeting recording."
            printf '%s\n' "https://video.isilive.ca/london/$VIDEO_URL" >"$MEETING_DIR/RecordingLink.txt"
          fi

          # Same split-then-extract approach as for the meeting links; here
          # awk '!x[$0]++' keeps only the first occurrence of each line.
          # NOTE(review): URLs and names are de-duplicated independently, so
          # the pairing below assumes both lists stay the same length - confirm.
          # Get attachment links
          grep "AgendaItemAgendaItem1TitleHeader" "$AGENDA_HTML" \
            | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' \
            | grep 'filestream.ashx' \
            | sed 's/href=.\([^/]*\)".*/\1/p' \
            | awk '!x[$0]++' >"${TEMP_DIR}attachment_urls"
          # Get attachment names
          grep "AgendaItemAgendaItem1TitleHeader" "$AGENDA_HTML" \
            | sed 's/data-original-title=./\ndata-original-title='\''/g' \
            | grep 'data-original-title' \
            | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' \
            | awk '!x[$0]++' >"${TEMP_DIR}attachment_names"

          # Download attachment and use the name grabbed above
          echo "Found the following agenda attachments:"
          while IFS= read -r ATTACH_URL && IFS= read -r ATTACH_NAME <&3; do
            echo " - $ATTACH_NAME"
            fetch_url "https://pub-london.escribemeetings.com/$ATTACH_URL" "$MEETING_DIR/Attachments/$ATTACH_NAME"
          done <"${TEMP_DIR}attachment_urls" 3<"${TEMP_DIR}attachment_names"
          echo "All attachments saved."

          # Agenda: save every PDF variant listed; only when no PDF exists at
          # all, save the HTML variants instead.
          if [[ -n "$AGENDA_REVISE_PDF_URL" || -n "$AGENDA_PDF_URL" ]]; then
            if [[ -n "$AGENDA_REVISE_PDF_URL" ]]; then
              echo "Saving revised agenda..."
              fetch_url "$AGENDA_REVISE_PDF_URL" "$MEETING_DIR/Agenda_Revised.pdf"
            fi
            if [[ -n "$AGENDA_PDF_URL" ]]; then
              echo "Saving regular agenda..."
              fetch_url "$AGENDA_PDF_URL" "$MEETING_DIR/Agenda.pdf"
            fi
          else
            if [[ -n "$AGENDA_REVISE_HTML_URL" ]]; then
              echo "Saving revised agenda as HTML (no PDF found!)"
              fetch_url "$AGENDA_REVISE_HTML_URL" "$MEETING_DIR/Agenda_Revised.html"
            fi
            if [[ -n "$AGENDA_HTML_URL" ]]; then
              echo "Saving regular agenda as HTML (no PDF found!)"
              fetch_url "$AGENDA_HTML_URL" "$MEETING_DIR/Agenda.html"
            fi
          fi

          # Minutes: same PDF-first policy as the agenda.
          if [[ -n "$MINUTES_ATTACH_PDF_URL" || -n "$MINUTES_PDF_URL" ]]; then
            if [[ -n "$MINUTES_ATTACH_PDF_URL" ]]; then
              echo "Saving minutes with attachments..."
              fetch_url "$MINUTES_ATTACH_PDF_URL" "$MEETING_DIR/Minutes_With_Attachments.pdf"
            fi
            if [[ -n "$MINUTES_PDF_URL" ]]; then
              echo "Saving minutes..."
              fetch_url "$MINUTES_PDF_URL" "$MEETING_DIR/Minutes.pdf"
            fi
          else
            if [[ -n "$MINUTES_HTML_URL" ]]; then
              echo "Saving minutes as HTML (no PDF found!)"
              fetch_url "$MINUTES_HTML_URL" "$MEETING_DIR/Minutes.html"
            fi
          fi
        fi
      fi

      # Is this the first line of a meeting item? If so, remember its name;
      # the next line read will be handled as that meeting's link line.
      if [[ "$LINE" == *"views-field-field-meeting-notes"* ]]; then
        FOUNDMEETING="TRUE"
        MEETING_NAME=$(squeeze_whitespace "$LINE" \
          | sed -n 's/.*<div class="meeting__date">\([^<]*\)<\/div>.*/\1/p')
      fi
    done <"$SEARCH_PAGE"
  fi

  i=$((i + 1))
done