Upload files to "/"

2026-04-07 19:49:40 -04:00 · 2026-04-07 19:49:40 -04:00 · 58c637e051
commit 58c637e051
parent 0793fe0036
6 changed files with 44 additions and 76 deletions
--- a/README.MD
+++ b/README.MD
@@ -17,8 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
 ## Scrape eScribe meetings (SCRAPE_MEET.SH)
-This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST="TRUE"`, meetings older than 2 months will be downloaded. Otherwise, they will be skipped.
+This bash script scrapes meetings from the eScribe meetings platform. It has a variable called `SUPPORT_PAST`. If `SUPPORT_PAST=1` (true), meetings older than 2 months will be downloaded. If it is `0`, they will be skipped.
 Don't ask why "TRUE" used to be a string and not a boolean... (it is now the integer flag `1`/`0`).
 The basic structure of the output files is:
 ```
--- a/SCRAPE_ESCRIBE.SH
+++ b/SCRAPE_ESCRIBE.SH
@@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
  CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
  CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
-	INDEX_END="FALSE"
+	INDEX_END=0
-	while [[ $INDEX_END == "FALSE" ]]; do
+	while (( ! INDEX_END )); do
 	  echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
 	  wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
 		if [ $? -ne 8 ]; then
-	    FOUNDLIST="FALSE"
+	    FOUNDLIST=0
 	    while IFS= read -r LINE; do
-				if [[ "TRUE" == $FOUNDLIST ]]; then
+				if (( FOUNDLIST )); then
 					GREPENDLIST=$(echo $LINE | grep '<option ')
 					if [[ "$GREPENDLIST" == "" ]]; then
 						echo "SCRAPE_ESCRIBE: End of list."
-						INDEX_END="TRUE"
+						INDEX_END=1
 						break
 					else
 	          MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
@@ -88,11 +88,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
 				GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
 				if [[ "$GREPLIST" != "" ]]; then
 					echo "SCRAPE_ESCRIBE: Found meeting type list."
-				  FOUNDLIST="TRUE"
+				  FOUNDLIST=1
 				fi
 	    done < $INDEX_PAGE
 		else
-	    INDEX_END="TRUE"
+	    INDEX_END=1
 	    echo "SCRAPE_ESCRIBE: Couldn't save index!"
 		fi
 	done
--- a/SCRAPE_LTC.SH
+++ b/SCRAPE_LTC.SH
@@ -112,11 +112,11 @@ while IFS= read -r LINE_PRE; do
 						GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
 						if [[ "$GREPARTICLESTART" != "" ]]; then
 							echo "            FOUND INDEX ARTICLE START"
-							ISARTICLE="TRUE"
+							ISARTICLE=1
 						elif [[ "$GREPARTICLEEND" != "" ]]; then
 							echo "            END OF INDEX ARTICLE"
-							ISARTICLE=""
+							ISARTICLE=0
-						elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then
+						elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
 							mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
 							ISPDF=$(echo $GREPLINK | grep "\.pdf")
 							if [[ "$ISPDF" != "" ]]; then
@@ -141,14 +141,14 @@ while IFS= read -r LINE_PRE; do
 										# CSS for the HTML is in the default template
 										cat ./template/default.html > ./tmp/new.html
 										echo "$LINE_ATTACH" >> ./tmp/new.html
-										ISATTACHMENTARTICLE="TRUE"
+										ISATTACHMENTARTICLE=1
 									elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
 										echo "                    END OF ATTACHMENT ARTICLE"
 										echo "$LINE_ATTACH" >> ./tmp/new.html
 										echo "                    PROCESSED TO PDF"
 										wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
-										ISATTACHMENTARTICLE=""
+										ISATTACHMENTARTICLE=0
-									elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then
+									elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
 										ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
 										if [[ "$ISREFPDF" != "" ]]; then
 											PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
@@ -157,7 +157,7 @@ while IFS= read -r LINE_PRE; do
 											wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
 											echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
 										fi
-									elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then
+									elif (( ISATTACHMENTARTICLE )); then
 										echo "$LINE_ATTACH" >> ./tmp/new.html
 									fi
 									LINE_ATTACH=""
--- a/SCRAPE_MEET.SH
+++ b/SCRAPE_MEET.SH
@@ -198,7 +198,7 @@ current_year=$(date +%Y)
 current_month=$(date +%m)
 current_day=$(date +%d)00
-SUPPORT_PAST="TRUE"
+SUPPORT_PAST=1
 if [ -d "$TEMP_DIR" ]; then
  rm -r $TEMP_DIR
@@ -214,18 +214,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
  CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
  CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
-        INDEX_END="FALSE"
+        INDEX_END=0
-        while [[ $INDEX_END == "FALSE" ]]; do
+        while (( ! INDEX_END )); do
          echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
          wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
                if [ $? -ne 8 ]; then
-            FOUNDLIST="FALSE"
+            FOUNDLIST=0
            while IFS= read -r LINE; do
-                                if [[ "TRUE" == $FOUNDLIST ]]; then
+                                if  (( FOUNDLIST )); then
                                        GREPENDLIST=$(echo $LINE | grep '<option ')
                                        if [[ "$GREPENDLIST" == "" ]]; then
                                                echo "SCRAPE_ESCRIBE: End of list."
-                                                INDEX_END="TRUE"
+                                                INDEX_END=1
                                                break
                                        else
                  MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
@@ -312,11 +312,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
                    else
                        echo "Dates are in the past!"
                        echo "DATE  : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
-                        INPAST="TRUE"
+                        INPAST=1
                    fi
                    # I think "break" broke when I did nested loops. idk I'm too drunk for this.
-                    if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
+                    if (( INPAST )) && (( ! SUPPORT_PAST )); then
                        echo "Abort."
                        break
                    fi
@@ -325,8 +325,8 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
                                #cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
                                VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
-                                                        ERROR="FALSE"
+                                                        ERROR=0
-                                                        ADDENDUM_ERROR="FALSE"
+                                                        ADDENDUM_ERROR=0
                    echo "Downloading agenda HTML..."
                    if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
                      wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
@@ -339,16 +339,16 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
                    elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
                      wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
                    else
-                      ERROR="TRUE"
+                      ERROR=1
                    fi
                    if [[ $ADDENDUM_HTML_URL != "" ]]; then
                      wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
                    else
-                      ADDENDUM_ERROR="TRUE"
+                      ADDENDUM_ERROR=1
                    fi
-                                                        if [[ "$ERROR" == "FALSE" ]]; then
+                                                        if (( ! ERROR )); then
                      mkdir "./$CITY_ARCHIVE_NAME"
                      mkdir "./$CITY_ARCHIVE_NAME/Meetings"
@@ -376,7 +376,7 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
                                                                cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
                                                                # Get attachment names
                                                                cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
-                      if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
+                      if (( ! ADDENDUM_ERROR )); then
                                                                        # Get attachment links
                                                                        cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
                                                                        # Get attachment names
@@ -410,11 +410,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
                                GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
                                if [[ "$GREPLIST" != "" ]]; then
                                        echo "SCRAPE_ESCRIBE: Found meeting type list."
-                                  FOUNDLIST="TRUE"
+                                  FOUNDLIST=1
                                fi
            done < $INDEX_PAGE
                else
-            INDEX_END="TRUE"
+            INDEX_END=1
            echo "SCRAPE_ESCRIBE: Couldn't save index!"
                fi
        done
--- a/SCRAPE_PLAN.SH
+++ b/SCRAPE_PLAN.SH
@@ -81,8 +81,8 @@ mkdir $TEMP_DIR
 SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
 j=0
-SEARCH_END="FALSE"
+SEARCH_END=0
-while [[ $SEARCH_END == "FALSE" ]]; do
+while (( ! SEARCH_END )); do
    echo "-========================================================================-"
    echo "Downloading search results... Page $j"
    wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
@@ -124,8 +124,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
                    rm -f $PROJECT_IMAGE_NAMES
                    while IFS= read -r PLINE; do
-                        if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
+                        if (( NEXT_LINE_FITEM )); then
-                            NEXT_LINE_FITEM="FALSE"
+                            NEXT_LINE_FITEM=0
                            # Is this line an actual item?
                            PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
@@ -215,8 +215,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
                            fi
                        fi
-                        if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
+                        if (( NEXT_LINE_IMAGE )); then
-                            NEXT_LINE_IMAGE="FALSE"
+                            NEXT_LINE_IMAGE=0
                            PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
                            PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
                            if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
@@ -243,14 +243,14 @@ while [[ $SEARCH_END == "FALSE" ]]; do
                        PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
                        if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
                            PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
-                            NEXT_LINE_FITEM="TRUE"
+                            NEXT_LINE_FITEM=1
                            # Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
                            # We're setting a flag to let the script know if an upcoming line is contents.
                        fi
                        PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
                        if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
-                            NEXT_LINE_IMAGE="TRUE"
+                            NEXT_LINE_IMAGE=1
                            # Same idea as before but for the image shown on the main page.
                        fi
@@ -340,11 +340,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
                fi
            done < $SEARCH_PAGE
        else
-            SEARCH_END="TRUE"
+            SEARCH_END=1
            echo "No more pages!"
        fi
    else
-        SEARCH_END="TRUE"
+        SEARCH_END=1
        echo "No more pages!"
    fi
    ((j++))
--- a/websites.csv
+++ b/websites.csv
@@ -1,34 +1,3 @@
-"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", ""
+"https://pub-london.escribemeetings.com/", "LondonArchive", ""
-"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", ""
+"https://pub-stthomas.escribemeetings.com/", "StThomasArchive", ""
 "https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
-"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
-"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
-"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
-"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
-"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
-"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
-"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
-"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
-"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
-"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
-"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
-"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
-"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
-"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
-"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
-"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
-"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
-"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
-"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
-"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
-"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
-"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
-"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
-"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
-"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
-"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
-"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
-"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
-"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
-"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
-"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""