#!/usr/bin/env bash
#
# SCRAPE_LONDON.SH: Downloads committee videos and agendas.
#
# Provenance (from the commit that added this file):
#   commit e6058497d372d7add84365f33ef3ded93c709aef
#   Author: Lillian Skinner <56081713+rvtr@users.noreply.github.com>
#   Date:   Fri May 16 04:44:00 2025 -0400
#
# What it does:
#   1. Recreates ./tmp/, ./Agenda/ and ./Video/ from scratch (existing
#      contents are deleted).
#   2. Fetches the london.ca meeting search, one result page at a time,
#      until a fetch fails or the page says "No results found.".
#   3. For every meeting found, writes into ./tmp/<meeting name>/:
#        Agenda.pdf          the PDF agenda
#        RecordingLink.txt   the isilive video URL (appended)
#        Attachments/        every filestream.ashx attachment of the agenda
#
# Requires: bash, wget, GNU grep/sed (uses `grep -o` and `\n` in sed
# replacements), awk.
#
# `set -e` is deliberately NOT enabled: several greps are expected to find
# nothing, and a single failed download should not abort the whole crawl.
#
# Warning from the original author to all who read this script:
# it is sloppy, but sometimes sloppy just works.

# NOTE(review): the inner padding of the banner lines looks like it was
# collapsed at some point; the strings are kept exactly as found. Re-align
# them if the box is supposed to be rectangular.
printf '\n%s\n' "-========================================================================-"
printf '%s\n' "-=- -=-"
printf '%s\n' "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
printf '%s\n' "-=- -=-"
printf '%s\n' "-========================================================================-"

# London seems to have recently blocked unusual user agents (plain wget and
# even ping are refused), so pretend to be a regular desktop browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index.html"
AGENDA_DIR="./Agenda/"
AGENDA_HTML="./tmp/work.html"
VIDEO_DIR="./Video/"
#VIDEO_TIMESTAMP_JSON="./tmp/time.json"

ESCRIBE_BASE="https://pub-london.escribemeetings.com/"
VIDEO_BASE="https://video.isilive.ca/london/"

# Start from a clean slate. `${dir:?}` aborts instead of running `rm -r` on an
# empty path should one of the variables above ever be blanked out.
for dir in "$TEMP_DIR" "$AGENDA_DIR" "$VIDEO_DIR"; do
  if [[ -d "$dir" ]]; then
    rm -r -- "${dir:?}"
  fi
  mkdir -- "$dir"
done

SEARCH_URL="https://london.ca/government/council-civic-administration/council-committee-meetings/meetings"
# The filter parameters below are not used yet.
# Need to confirm: when stacking params, does the date need to be ?f[1]?
SEARCH_PARAM_COMMITTEE="f[0]=meeting_type%3A"
SEARCH_PARAM_DATE="f[0]=meeting_date%3A"
SEARCH_PARAM_QUERY="search=query&sort_by=field_meeting_date"

i=0
while true; do
  # Stop on ANY wget failure. The original only stopped on exit status 8
  # (server error response); every other failure (no network, DNS error,
  # wget missing) left an empty or absent page behind and the loop kept
  # requesting the next page forever.
  if ! wget --user-agent="$WGET_UA" "${SEARCH_URL}?page=${i}" -O "$SEARCH_PAGE"; then
    printf 'SCRAPE_LONDON: could not fetch search page %s, stopping.\n' "$i" >&2
    break
  fi

  # Past the last page the site still answers, but with this marker text.
  if grep -q "No results found." "$SEARCH_PAGE"; then
    break
  fi

  # All meeting items in the search results are formatted like so:
  #   - one line with the name
  #   - a second line with all other info, including links
  #
  # The first line is recognised by the class
  # "views-field-field-meeting-notes". FOUNDMEETING=TRUE records that the
  # first line was just seen, so the next line read is "confirmed" as line 2
  # of that meeting. The first two links of every second line are (in order)
  # the PDF and HTML agendas.
  FOUNDMEETING="FALSE"
  while IFS= read -r LINE; do

    if [[ "$FOUNDMEETING" == "TRUE" ]]; then
      FOUNDMEETING="FALSE"

      # NOTE(review): the original link-extraction expression, the
      # assignments of the two agenda URLs, and the download of the HTML
      # agenda were lost from the copy of the script this was restored from
      # (the HTML tags inside the sed pattern were stripped). What follows is
      # rebuilt from the surviving description above and from the later reads
      # of $AGENDA_HTML -- confirm against a live results page.
      AGENDA_URLS=$(
        grep -oE "href=[\"'][^\"']+[\"']" <<<"$LINE" \
          | sed -e 's/^href=.//' -e 's/.$//' -e 's/&amp;/\&/g'
      )
      AGENDA_PDF_URL=$(sed -n '1p' <<<"$AGENDA_URLS")
      AGENDA_HTML_URL=$(sed -n '2p' <<<"$AGENDA_URLS")

      if [[ -z "$AGENDA_PDF_URL" || -z "$AGENDA_HTML_URL" ]]; then
        printf 'SCRAPE_LONDON: no agenda links found for "%s", skipping.\n' \
          "$MEETING_NAME" >&2
      else
        # The HTML agenda is the working copy that the video and attachment
        # scraping below reads from.
        wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$AGENDA_HTML"

        # The isilive stream name sits in the data-stream_name attribute of
        # the element with id="isi_player".
        # There are some eScribe-hosted videos too, but those are m3u8
        # playlists and are really annoying to work with
        # ...though not as annoying as more sed.
        STREAM_NAME=$(
          grep 'id="isi_player"' "$AGENDA_HTML" \
            | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' \
            | sed 's/ /%20/g'
        )
        VIDEO_URL="${VIDEO_BASE}${STREAM_NAME}"

        # Finalize everything
        MEETING_DIR="${TEMP_DIR}${MEETING_NAME}"
        mkdir -p -- "${MEETING_DIR}/Attachments"
        wget --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "${MEETING_DIR}/Agenda.pdf"
        printf '%s\n' "$VIDEO_URL" >> "${MEETING_DIR}/RecordingLink.txt"

        # Get attachment links: put every filestream.ashx href on its own
        # line, keep the part up to the closing quote, and drop duplicates
        # (sed's `p` without -n prints each match twice; awk keeps the first).
        grep "AgendaItemAgendaItem1TitleHeader" "$AGENDA_HTML" \
          | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' \
          | grep 'filestream.ashx' \
          | sed 's/href=.\([^/]*\)".*/\1/p' \
          | awk '!x[$0]++' > "${TEMP_DIR}attachment_urls"

        # Get attachment names the same way, from data-original-title.
        # The two lists are de-duplicated independently, so the pairing
        # below relies on them having the same order and length.
        grep "AgendaItemAgendaItem1TitleHeader" "$AGENDA_HTML" \
          | sed 's/data-original-title=./\ndata-original-title='\''/g' \
          | grep 'data-original-title' \
          | sed 's/data-original-title=.\([^'\''/]*\)'\''.*/\1/p' \
          | awk '!x[$0]++' > "${TEMP_DIR}attachment_names"

        # Download each attachment under the name grabbed above. URLs come
        # in on stdin, names on fd 3, read in lockstep.
        while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
          wget --user-agent="$WGET_UA" "${ESCRIBE_BASE}${LINEA1}" \
            -O "${MEETING_DIR}/Attachments/${LINEA2}"
          echo "Here are the datas: ${ESCRIBE_BASE}${LINEA1}/n${LINEA2}"
        done < "${TEMP_DIR}attachment_urls" 3< "${TEMP_DIR}attachment_names"
      fi
    fi

    if [[ "$LINE" == *"views-field-field-meeting-notes"* ]]; then
      # NOTE(review): the opening tag that originally anchored this pattern
      # was lost; this version takes the text directly in front of the last
      # "</div>" on the line. Confirm it still picks the meeting title.
      MEETING_NAME=$(sed -n 's/.*>\([^<]*\)<\/div>.*/\1/p' <<<"$LINE")
      if [[ -n "$MEETING_NAME" ]]; then
        FOUNDMEETING="TRUE"
        printf '%s\n' "$MEETING_NAME"
      else
        # Without a name everything would be dumped straight into ./tmp/.
        printf 'SCRAPE_LONDON: could not read a meeting name, skipping.\n' >&2
      fi
    fi
  done < "$SEARCH_PAGE"

  i=$((i + 1))
done