Separate functions

This commit is contained in:
Lillian Skinner 2026-06-19 23:30:51 -04:00
parent 3bce46e582
commit 16c4905b41
No known key found for this signature in database
GPG Key ID: 17F0E72D2C98B0A6
20 changed files with 1531 additions and 545 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
LondonArchive/
LondonScrapers_privdata/
tmp/
staging/

6
README.MD Normal file → Executable file
View File

@ -17,7 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
## Scrape eScribe meetings (SCRAPE_MEET.SH) ## Scrape eScribe meetings (SCRAPE_MEET.SH)
This bash script will scrape meetings from the eScribe meetings platform. This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST=1` (true), meetings older than 2 months will be downloaded. Otherwise, they will be skipped.
The basic structure of the output files is: The basic structure of the output files is:
``` ```
@ -76,7 +76,7 @@ The basic structure of the output files is:
This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/ This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/
Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. The HTML --> PDF conversion needs the template page included at `./template/default.html`.
The basic structure of the output files is: The basic structure of the output files is:
``` ```
@ -102,4 +102,4 @@ The basic structure of the output files is:
|- <attachment 1>.pdf |- <attachment 1>.pdf
|- <attachment 2>.pdf |- <attachment 2>.pdf
\- etc etc \- etc etc
``` ```

66
SCRAPE_AGIS.SH Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/env bash
# SCRAPE_AGIS.SH setup: prints the banner, loads the shared helper functions
# and prepares the scratch paths used by the catalogue walk that follows.
# All paths are relative to the current working directory.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_AGIS.SH: Downloads ArcGIS maps -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# The download loop calls _utils_download_helper, which is expected to come
# from this file. Without it every download would fail with "command not
# found", so stop early instead.
if ! source ./functions/.functions; then
  echo "SCRAPE_AGIS: unable to load ./functions/.functions" >&2
  exit 1
fi

# Browser-like User-Agent sent with the catalogue requests.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
# Root of the ArcGIS REST services directory.
ARGIS_URL="https://maps.london.ca/server/rest/services"

# Scratch locations. TMP_STAGING is wiped and recreated for every layer.
TMP="./tmp"
TMP_STAGING="./tmp/layers"
SERVICELIST_JSON="$TMP/servicelist.json"
FOLDER_JSON="$TMP/folder.json"
SERVICE_JSON="$TMP/service.json"
LAYERQUERY_JSON="$TMP/layer_query.json"

# -p: the scratch directory survives between runs, so plain mkdir would print
# "File exists" errors on every run after the first.
mkdir -p "$TMP" "$TMP_STAGING"
# Walk the ArcGIS REST catalogue: folder -> MapServer service -> layer.
# Every layer is exported page by page as GeoJSON and KMZ into $TMP_STAGING,
# and the staging directory is then packed into one 7z archive per layer under
# LondonArchive/ArcGIS/<service>/.
#
# NOTE(review): only services that live inside a folder are visited; the
# ".services" array of the root listing is never read.

# Number of records requested per page (resultRecordCount).
MAX_REQUESTS=2000

wget "$ARGIS_URL?f=json" --user-agent="$WGET_UA" -O "$SERVICELIST_JSON" -q
jq -r '.folders[]?' "$SERVICELIST_JSON" | while read -r FOLDER; do
  wget "$ARGIS_URL/$FOLDER?f=json" --user-agent="$WGET_UA" -O "$FOLDER_JSON" -q
  echo "Looking in $FOLDER"
  # "[]?" keeps jq quiet when the response has no "services" array
  # (for example an error document).
  jq -r '.services[]?
    | select(.type=="MapServer")
    | .name' "$FOLDER_JSON" | while read -r SERVICE; do
    echo "Found $SERVICE"
    echo "$ARGIS_URL/$SERVICE/MapServer"
    wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q
    mkdir -p "LondonArchive/ArcGIS/${SERVICE}"
    jq -r '.layers[]? | "\(.id)|\(.name)"' "$SERVICE_JSON" | while IFS='|' read -r LAYERID LAYERNAME; do
      # Fresh staging directory per layer. ":?" aborts the rm if the variable
      # is ever empty instead of deleting the wrong path.
      rm -rf -- "${TMP_STAGING:?}"
      mkdir -p "$TMP_STAGING"

      # Make the layer name safe as a file name: slashes and backslashes become
      # spaces, runs of spaces collapse, and edge spaces are dropped. The name
      # is quoted so glob characters in it are never expanded.
      LAYERNAME_CLEAN=$(printf '%s\n' "$LAYERNAME" \
        | sed -E -e 's/\// /g' -e 's/\\/ /g' -e 's/ {2,}/ /g' -e 's/^ //' -e 's/ $//')

      # Ask the server how many records the layer holds. The stale file from
      # the previous layer is removed first so a failed request cannot reuse
      # an old count. The same User-Agent as the other requests is sent.
      rm -f "$TMP/count.json"
      curl -s -A "$WGET_UA" "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&returnCountOnly=true&f=json" -o "$TMP/count.json"
      ITEM_COUNT=$(jq -r '.count' "$TMP/count.json" 2>/dev/null)
      # jq prints "null" when the key is missing; treat anything that is not a
      # plain number as 0 so the arithmetic below is well defined.
      [[ "$ITEM_COUNT" =~ ^[0-9]+$ ]] || ITEM_COUNT=0

      i=0 # record offset of the current page
      j=0 # page number, used in the file names
      # The first page is always requested (also when the count is 0 or
      # unknown); further pages only while records remain. Using "<" rather
      # than "<=" avoids one extra, empty request when the count is an exact
      # multiple of the page size.
      while :; do
        echo "Downloading $LAYERID-${j} $LAYERNAME_CLEAN"
        echo "$i of $ITEM_COUNT"
        QUERY_URL="$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}"
        _utils_download_helper "${QUERY_URL}&f=geojson" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.geojson"
        echo "Done GeoJSON!"
        _utils_download_helper "${QUERY_URL}&f=kmz" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.kmz"
        echo "Done KMZ!"
        i=$(( i + MAX_REQUESTS ))
        j=$(( j + 1 ))
        (( i < ITEM_COUNT )) || break
      done

      7z a "LondonArchive/ArcGIS/${SERVICE}/Layer ${LAYERID} - ${LAYERNAME_CLEAN}.7z" "$TMP_STAGING"
    done
  done
done

14
SCRAPE_ESCRIBE.SH Normal file → Executable file
View File

@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
INDEX_END="FALSE" INDEX_END=0
while [[ $INDEX_END == "FALSE" ]]; do while (( ! INDEX_END )); do
echo "SCRAPE_ESCRIBE: Downloading eScribe index..." echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
if [ $? -ne 8 ]; then if [ $? -ne 8 ]; then
FOUNDLIST="FALSE" FOUNDLIST=0
while IFS= read -r LINE; do while IFS= read -r LINE; do
if [[ "TRUE" == $FOUNDLIST ]]; then if (( FOUNDLIST )); then
GREPENDLIST=$(echo $LINE | grep '<option ') GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list." echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE" INDEX_END=1
break break
else else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g') MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
@ -88,11 +88,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"') GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list." echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE" FOUNDLIST=1
fi fi
done < $INDEX_PAGE done < $INDEX_PAGE
else else
INDEX_END="TRUE" INDEX_END=1
echo "SCRAPE_ESCRIBE: Couldn't save index!" echo "SCRAPE_ESCRIBE: Couldn't save index!"
fi fi
done done

329
SCRAPE_GINV.SH Executable file
View File

@ -0,0 +1,329 @@
#!/usr/bin/env bash
# SCRAPE_GINV.SH setup: banner, shared helpers, scratch file locations and the
# current date used by the recency filter in the main loop.
# All paths are relative to the current working directory.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
# The title previously named SCRAPE_MPaS.SH (Master Plans and Strategies),
# copied from another script; this file scrapes getinvolved.london.ca.
echo -e "-=- SCRAPE_GINV.SH: Scrape London Get Involved projects -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# The rest of the script calls _utils_fix_dashes, _utils_download_helper and
# _time_parse_monddyyyy, which are expected to come from this file. Stop early
# if it cannot be loaded rather than scraping with missing commands.
if ! source ./functions/.functions; then
  echo "SCRAPE_GINV: unable to load ./functions/.functions" >&2
  exit 1
fi

# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files. The CUSTOM_HTML_* files collect fragments of one project page
# and are deleted again before each project is processed.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_TIMELINE="./tmp/custom_timeline_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

# Today's date, zero-padded (for example month "08").
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

rm -f "$SEARCH_PAGE"
# -p: the scratch directory normally already exists from an earlier run.
mkdir -p "$TEMP_DIR"
SEARCH_URL="https://getinvolved.london.ca/projects"
# set_metadata CARD_HTML
#
# Pulls the data-project-* attributes out of one project card line from the
# index page and stores them in globals:
#   PROJECT_NAME      data-project-name, cleaned and passed through
#                     _utils_fix_dashes (sourced from ./functions/.functions)
#   PROJECT_CATS      data-project-category
#   PROJECT_LOCATION  data-project-location
# The decoded input line is left in the global "tmp".
set_metadata() {
  local raw_name

  # Decode &amp;, drop &quot;, turn &#039; into an apostrophe and strip square
  # brackets. The expressions run in this order within a single sed call.
  tmp=$(
    printf '%s\n' "$1" | sed \
      -e 's/&amp;/\&/g' \
      -e 's/&quot;//g' \
      -e "s/&#039;/'/g" \
      -e 's/\[//g' \
      -e 's/\]//g'
  )

  # $tmp is deliberately left unquoted here: word splitting collapses runs of
  # whitespace before the name is extracted.
  # NOTE(review): the two 's///g' stages have an empty pattern as shown; they
  # look like they once removed characters that are not printable here -
  # confirm against the repository copy. They must stay separate sed processes.
  raw_name=$(echo $tmp | sed 's/.*data-project-name="\([^"]*\)".*/\1/' | sed 's///g' | sed 's///g' | sed 's/'\''//g' | sed 's/://g')
  PROJECT_NAME=$(_utils_fix_dashes "$raw_name")

  PROJECT_CATS=$(sed 's/.*data-project-category="\([^"]*\)".*/\1/' <<<"$tmp")
  PROJECT_LOCATION=$(sed 's/.*data-project-location="\([^"]*\)".*/\1/' <<<"$tmp")
}
# ---------------------------------------------------------------------------
# Main scrape.
#
# The project index is read line by line. For every project card the loop
# collects the card metadata (set_metadata), the project URL (taken from the
# line after the card) and the "dt-updated" date. On the line following a
# recent date, the project page itself is downloaded and scanned.
#
# The project page scan is a set of independent on/off flags, one per kind of
# block (content, documents, photos, FAQ, profile, timeline, key dates, slider,
# single image). A start marker switches a flag on; while it is on, lines are
# copied to the PDF source ($CUSTOM_HTML), to a side list ($CUSTOM_HTML_LINKS,
# $CUSTOM_HTML_PHOTOS, $CUSTOM_HTML_SLIDER) and/or to the raw dump ($FULLDUMP)
# until that block's end marker is seen.
#
# Most "echo $VAR" calls are intentionally unquoted: word splitting strips the
# page's indentation and collapses whitespace in what gets written out.
# ---------------------------------------------------------------------------

# _ginv_is_recent ITEM_YEAR ITEM_MONTH NOW_YEAR NOW_MONTH
# Succeeds when the item's month is the previous month, the current month or
# later. Months are compared as year*12+month so that a December item seen in
# January still counts as "previous month"; comparing year and month separately
# skipped it. Fails on non-numeric input.
_ginv_is_recent() {
  local item_year=$1 item_month=$2 now_year=$3 now_month=$4
  [[ "$item_year" =~ ^[0-9]+$ && "$item_month" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so zero-padded values such as "08" are not read as octal.
  (( 10#$item_year * 12 + 10#$item_month >= 10#$now_year * 12 + 10#$now_month - 1 ))
}

wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
# Only wget exit status 8 is treated as "index not available".
if [ $? -ne 8 ]; then
  while IFS= read -r LINE; do
    if (( FOUND_DATE )) && [[ "$LAST_LINE" == "" ]] && _ginv_is_recent "$ITEM_YEAR" "$ITEM_MONTH" "$current_year" "$current_month"; then
      FOUND_DATE=0
      echo "$PROJECT_URL"
      echo "$PROJECT_NAME"
      wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

      # Now we can work on the actual project page.
      rm -f "$CUSTOM_HTML_LINKS"
      rm -f "$CUSTOM_HTML_PHOTOS"
      rm -f "$CUSTOM_HTML_FAQ"
      rm -f "$CUSTOM_HTML_PROFILE"
      rm -f "$CUSTOM_HTML_TIMELINE"
      rm -f "$CUSTOM_HTML_KEYDATES"
      rm -f "$CUSTOM_HTML_SLIDER"
      rm -f "$FULLDUMP"
      cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
      echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

      # Start every project with all block flags off. Otherwise a block whose
      # end marker never appeared would stay "open" into the next project.
      NEXT_LINE_CONTENT=0
      FIRST_CONTENT=0
      IS_DOC_BLOCK=0
      IS_PHOTO_BLOCK=0
      IS_FAQ_BLOCK=0
      IS_PROFILE_BLOCK=0
      IS_TIMELINE_BLOCK=0
      IS_KEYDATES_BLOCK=0
      IS_SLIDER_BLOCK=0
      IS_SINGLE_IMAGE_BLOCK=0

      while IFS= read -r LINE_PROJ; do
        # --- Body content -------------------------------------------------
        if (( NEXT_LINE_CONTENT )); then
          # Next hive-block marks end of current item
          if [[ "$LINE_PROJ" == *"hive-block"* ]] || [[ "$LINE_PROJ" == "" ]]; then
            NEXT_LINE_CONTENT=0
            echo "End of current content."
          else
            # Ignore boring notices
            if [[ "$LINE_PROJ" != *"</h1>"* ]] &&
              [[ "$LINE_PROJ" != *"City of London Land Acknowledgement"* ]] &&
              [[ "$LINE_PROJ" != *"Ongoing Site Specific Planning Applications"* ]] &&
              [[ "$LINE_PROJ" != *"This site is owned and operated by the City of London using software licensed from Social Pinpoint"* ]] &&
              [[ "$LINE_PROJ" != *"Social Pinpoint has been commissioned by City of London (Canada) to collect and display user content on their behalf"* ]] &&
              [[ "$LINE_PROJ" != *"Notice of Collection of Personal Information"* ]] &&
              [[ "$LINE_PROJ" != *'href="/register"'* ]] &&
              [[ "$LINE_PROJ" != *'href="/login"'* ]] &&
              [[ "$LINE_PROJ" != *"Users have the right to access, correct, or delete their personal information"* ]] &&
              [[ "$LINE_PROJ" != *"This privacy policy may change from time to time"* ]] &&
              #[[ "$LINE_PROJ" != *"Share your feedback"* ]] &&
              [[ "$LINE_PROJ" != *"Notice of Collection"* ]] &&
              #[[ "$LINE_PROJ" != *"Subscribe for project updates"* ]] &&
              [[ "$LINE_PROJ" != *"Ready to have your say?"* ]]; then
              # The body marker is written with the first kept line, so it only
              # appears in the dump when content actually exists.
              if (( FIRST_CONTENT )); then
                echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
                FIRST_CONTENT=0
              fi
              # seds to replace youtube iframe with a normal <a href=""> link. wkhtmltopdf obviously can't embed youtube videos.
              CONTENT_LINE=$(echo " $LINE_PROJ" | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/')
              echo $CONTENT_LINE >> "$CUSTOM_HTML"
              echo $CONTENT_LINE >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Document library modal ----------------------------------------
        if (( IS_DOC_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_DOC_BLOCK=0
            # Drop the list when it holds no download link. grep reads the file
            # directly; a missing file simply counts as "no link".
            if ! grep -q "/download_file/" "$CUSTOM_HTML_LINKS" 2>/dev/null; then
              rm -f "$CUSTOM_HTML_LINKS"
            fi
            echo "End of current documents."
          else
            if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML_LINKS"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Photo gallery (one URL per line) --------------------------------
        if (( IS_PHOTO_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- end foreach -->"* ]]; then
            IS_PHOTO_BLOCK=0
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              rm -f "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current photos."
          else
            if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
              PHOTO_URL=$(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/')
              echo $PHOTO_URL >> "$CUSTOM_HTML_PHOTOS"
              echo $PHOTO_URL >> "$FULLDUMP"
            fi
          fi
        fi

        # --- FAQ modal --------------------------------------------------------
        if (( IS_FAQ_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_FAQ_BLOCK=0
            echo "End of current FAQ."
          else
            if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
              # I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
              if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
                # Question links become <h3> headings (PDF only, not dumped).
                echo $(echo $LINE_PROJ | sed 's/<a role/<h3 role/g' | sed 's/<\/a>/<\/h3>/g') >> "$CUSTOM_HTML"
              elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
                echo $LINE_PROJ >> "$CUSTOM_HTML"
                echo $LINE_PROJ >> "$FULLDUMP"
              fi
            fi
          fi
        fi

        # --- Profile / bio ------------------------------------------------------
        if (( IS_PROFILE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<script>"* ]]; then
            IS_PROFILE_BLOCK=0
            echo "End of current profile."
          else
            if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Timeline -------------------------------------------------------------
        if (( IS_TIMELINE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"btn-unfill btn-primary"* ]]; then
            IS_TIMELINE_BLOCK=0
            echo "End of current timeline."
          else
            if [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Key dates modal ----------------------------------------------------
        if (( IS_KEYDATES_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_KEYDATES_BLOCK=0
            echo "End of current key dates."
          else
            if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Image slider (rendered later as a separate gallery PDF) -------------
        if (( IS_SLIDER_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
            IS_SLIDER_BLOCK=0
            # This message used to say "key dates" (copied from the block above).
            echo "End of current slider."
          else
            if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML_SLIDER"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Single image block ----------------------------------------------------
        if (( IS_SINGLE_IMAGE_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_SINGLE_IMAGE_BLOCK=0
            # NOTE(review): this checks the photo list although the image line
            # itself is written to $CUSTOM_HTML - kept as found.
            if grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              cat "$CUSTOM_HTML_PHOTOS"
            else
              rm -f "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current single image."
          else
            if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Start markers: switch a block on for the following lines --------------
        if [[ "$LINE_PROJ" == *"hive-block hive-block-content ljs"* ]]; then
          NEXT_LINE_CONTENT=1
          FIRST_CONTENT=1
          # We'll write the LA comment inside of the content block.
          # There we can ensure that the comment is only written if content does exist.
          echo "Found content start."
        elif [[ "$LINE_PROJ" == *"docLibModal hive-block-document-library"* ]]; then
          IS_DOC_BLOCK=1
          echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
          echo "Found documents start."
        elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
          IS_PHOTO_BLOCK=1
          echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
          echo "Found photos start."
        elif [[ "$LINE_PROJ" == *"hive-modal faqModal hive-block-faq"* ]]; then
          IS_FAQ_BLOCK=1
          echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
          echo "Found FAQ start."
        elif [[ "$LINE_PROJ" == *"hive-block-bio hive-block"* ]]; then
          IS_PROFILE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
          echo "Found profile start."
        elif [[ "$LINE_PROJ" == *"hive-block-timeline hive-block"* ]]; then
          IS_TIMELINE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Timeline -->" >> "$FULLDUMP"
          echo "Found timeline start."
        elif [[ "$LINE_PROJ" == *"hive-modal dateModal"* ]]; then
          IS_KEYDATES_BLOCK=1
          echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
          echo "Found key dates start."
        elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
          IS_SLIDER_BLOCK=1
          echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
          echo "Found slider start."
        elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
          IS_SINGLE_IMAGE_BLOCK=1
          echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
          echo "Found single image start."
        fi
      done < "$PROJECT_PAGE"

      #cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
      #cat "$CUSTOM_HTML_LINKS" >> "$CUSTOM_HTML"
      mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"

      # Download every document linked from the document library.
      if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"download_file"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_URL=$(echo $LINE_DOC | sed 's/.*href="\([^"]*\)".*/\1/')
            # The real file name is the last path segment of the redirect
            # target, minus a leading 32-hex-digit prefix.
            #  - header lines end in CR LF, so the CR is removed first;
            #    previously only names ending in ".pdf" had it stripped
            #  - the header name is matched case-insensitively
            #  - with several redirects only the final Location is used
            DOC_NAME=$(curl -s -L -I -A "$WGET_UA" "$DOC_URL" \
              | tr -d '\r' \
              | grep -i '^location:' \
              | tail -n 1 \
              | sed -e 's/^[^:]*: *//' -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
            # No redirect seen: fall back to the last segment of the link so the
            # download target is never just the directory.
            [[ -n "$DOC_NAME" ]] || DOC_NAME=${DOC_URL##*/}
            echo "$DOC_NAME"
            _utils_download_helper "$DOC_URL" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_LINKS"
      fi

      # Download the gallery photos collected above.
      if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"amazonaws"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME=$(echo $LINE_DOC | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//')
            echo "$DOC_NAME"
            _utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_PHOTOS"
      fi

      # Render the slider images as their own gallery PDF.
      if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
        mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
        cat ./template/default_getinvolved.html > "$WORK_HTML"
        echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> "$WORK_HTML"
        cat "$CUSTOM_HTML_SLIDER" >> "$WORK_HTML"
        echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$WORK_HTML"
        wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
      fi

      # Render the main PDF and keep the raw dump next to it.
      echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$CUSTOM_HTML"
      wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
      cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
    fi

    # The line after a project card carries the project link.
    if (( NEXT_LINE_URL )); then
      NEXT_LINE_URL=0
      PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/')
    fi

    if [[ "$LAST_LINE" != "" ]]; then
      # Second half of a card that was split over two lines.
      set_metadata "$LAST_LINE$LINE"
      LAST_LINE=""
      NEXT_LINE_URL=1
    elif [[ "$LINE" == *"h-entry project card"* ]] && [[ "$LINE" == *"data-project-name"* ]] && [[ "$LINE" != *"<%-"* ]]; then
      #echo $LINE
      if [[ "$LINE" != *"data-project-category"* ]]; then
        # Sometimes lines are split, so we'll combine the pieces over time.
        LAST_LINE=$LINE
        echo "Line is split!"
      else
        LAST_LINE=""
        set_metadata "$LINE"
        NEXT_LINE_URL=1
      fi
    elif [[ "$LINE" == *'time class="dt-updated"'* ]]; then
      PROJECT_DATE=$(echo $LINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g')
      echo "$PROJECT_DATE"
      # Sourced helper; the code above relies on it setting ITEM_YEAR,
      # ITEM_MONTH and ITEM_DAY.
      _time_parse_monddyyyy "$PROJECT_DATE"
      echo "$ITEM_YEAR$ITEM_MONTH$ITEM_DAY"
      FOUND_DATE=1
    fi
  done < "$SEARCH_PAGE"
fi

301
SCRAPE_GINV_OLD.SH Executable file
View File

@ -0,0 +1,301 @@
#!/usr/bin/env bash
# SCRAPE_GINV_OLD.SH setup: banner, shared helpers and scratch file locations
# for scraping the EngagementHQ-hosted Get Involved site.
# All paths are relative to the current working directory.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
# The title previously named SCRAPE_MPaS.SH (Master Plans and Strategies),
# copied from another script.
echo -e "-=- SCRAPE_GINV_OLD.SH: Scrape Get Involved London (EngagementHQ) -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# The rest of the script calls _utils_fix_dashes and _utils_download_helper,
# which are expected to come from this file. Stop early if it cannot be loaded.
if ! source ./functions/.functions; then
  echo "SCRAPE_GINV_OLD: unable to load ./functions/.functions" >&2
  exit 1
fi

# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files. The CUSTOM_HTML_* files collect fragments of one project page
# and are deleted again before each project is processed.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_UPDATE="./tmp/custom_update_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

rm -f "$SEARCH_PAGE"
# -p: the scratch directory normally already exists from an earlier run.
mkdir -p "$TEMP_DIR"
# Site root; project links found in the index are appended to it.
SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"
# ---------------------------------------------------------------------------
# Main scrape (EngagementHQ layout).
#
# The project index is read line by line. A "project-tile__link" line gives
# the project URL and clears the current name; a "project-tile__meta__name"
# line gives the name (possibly split over two lines). On the line after a
# name has been found, the project page is downloaded and scanned, and the
# name is cleared again.
#
# The project page scan is a set of independent on/off flags, one per kind of
# block. A start marker switches a flag on; while it is on, lines are copied
# to the PDF source ($CUSTOM_HTML), to a side list and/or to the raw dump
# ($FULLDUMP) until that block's end marker is seen.
#
# Most "echo $VAR" calls are intentionally unquoted: word splitting strips the
# page's indentation and collapses whitespace in what gets written out.
# ---------------------------------------------------------------------------

# _ginv_old_project_name HTML
# Prints the cleaned project name found in the <span> of HTML. One function
# serves both the single-line and the split-line case so they apply the same
# clean-up (the split-line path used to skip the &#39; removal).
# The final trim needs -E: in a basic regex "+" and "|" are literal, so the
# old expression never matched.
# NOTE(review): the two 's///g' stages have an empty pattern as shown; they
# look like they once removed characters that are not printable here -
# confirm against the repository copy.
_ginv_old_project_name() {
  _utils_fix_dashes "$(echo $1 | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' | sed 's///g' | sed 's/\// and /g' | sed 's/\\/ and /g' | sed 's///g' | sed 's/'\''//g' | sed 's/://g' | sed 's/®//g' | sed 's/&quot;//g' | sed 's/&amp;/and/g' | sed 's/&#39;//g' | sed 's/amp;//g' | sed -E 's/^ +| +$//g')"
}

# _ginv_old_doc_name HTML
# Prints "<link text>.pdf" for a document-library line: the text of the <a>
# element without a " (pdf)" suffix and without leading or trailing spaces.
_ginv_old_doc_name() {
  printf '%s.pdf\n' "$(echo $1 | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/g' | sed 's/ (pdf)//' | sed -E 's/^ +| +$//g')"
}

wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
# Only wget exit status 8 is treated as "index not available".
if [ $? -ne 8 ]; then
  while IFS= read -r LINE; do
    if [[ "$PROJECT_NAME" != "" ]]; then
      echo "$PROJECT_URL"
      echo "$PROJECT_NAME"
      wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

      # Now we can work on the actual project page.
      rm -f "$CUSTOM_HTML_LINKS"
      rm -f "$CUSTOM_HTML_PHOTOS"
      rm -f "$CUSTOM_HTML_FAQ"
      rm -f "$CUSTOM_HTML_PROFILE"
      rm -f "$CUSTOM_HTML_UPDATE"
      rm -f "$CUSTOM_HTML_KEYDATES"
      rm -f "$CUSTOM_HTML_SLIDER"
      rm -f "$FULLDUMP"
      cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
      echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

      while IFS= read -r LINE_PROJ; do
        # --- Document library ----------------------------------------------
        if (( IS_DOC_BLOCK )); then
          if [[ "$LINE_PROJ" == "</ul>" ]]; then
            IS_DOC_BLOCK=0
            # Drop the list when it holds no document link. A missing file
            # simply counts as "no link".
            if ! grep -q "/documents/" "$CUSTOM_HTML_LINKS" 2>/dev/null; then
              rm -f "$CUSTOM_HTML_LINKS"
            fi
            echo "End of current documents."
          elif [[ "$LINE_PROJ" == *"a data-url"* ]]; then
            echo $LINE_PROJ >> "$CUSTOM_HTML_LINKS"
            echo $LINE_PROJ >> "$FULLDUMP"
            echo $LINE_PROJ
          fi
        fi

        # --- Photo gallery (one URL per line) -------------------------------
        if (( IS_PHOTO_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_PHOTO_BLOCK=0
            # NOTE(review): this keeps the list only when it mentions
            # "amazonaws", while the download loop below looks for
            # "ehq-production" - kept as found.
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              rm -f "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current photos."
          else
            if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
              PHOTO_URL=$(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/')
              echo $PHOTO_URL >> "$CUSTOM_HTML_PHOTOS"
              echo $PHOTO_URL >> "$FULLDUMP"
            fi
          fi
        fi

        # --- FAQ -------------------------------------------------------------
        if (( IS_FAQ_BLOCK )); then
          if [[ "$LINE_PROJ" == *"div class='clearfix'></div"* ]]; then
            IS_FAQ_BLOCK=0
            echo "End of current FAQ."
          else
            if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
              # I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
              if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
                # Question links become <h3> headings (PDF only, not dumped).
                echo $(echo $LINE_PROJ | sed 's/<a role/<h3 role/g' | sed 's/<\/a>/<\/h3>/g') >> "$CUSTOM_HTML"
              elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
                echo $LINE_PROJ >> "$CUSTOM_HTML"
                echo $LINE_PROJ >> "$FULLDUMP"
              fi
            fi
          fi
        fi

        # --- Project team / profile -------------------------------------------
        if (( IS_PROFILE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!--[if IE]>"* ]]; then
            IS_PROFILE_BLOCK=0
            echo "End of current profile."
          else
            if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Project updates -----------------------------------------------------
        if (( IS_UPDATE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<div class='clearfix'></div>"* ]]; then
            IS_UPDATE_BLOCK=0
            echo "End of current update."
          else
            if [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Key dates / life cycle ------------------------------------------------
        if (( IS_KEYDATES_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_KEYDATES_BLOCK=0
            echo "End of current key dates."
          else
            if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Image slider (rendered later as a separate gallery PDF) --------------
        if (( IS_SLIDER_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
            IS_SLIDER_BLOCK=0
            # This message used to say "key dates" (copied from the block above).
            echo "End of current slider."
          else
            if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML_SLIDER"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Single image block ------------------------------------------------------
        if (( IS_SINGLE_IMAGE_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_SINGLE_IMAGE_BLOCK=0
            if grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              cat "$CUSTOM_HTML_PHOTOS"
            else
              rm -f "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current single image."
          else
            if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
              echo $LINE_PROJ >> "$CUSTOM_HTML"
              echo $LINE_PROJ >> "$FULLDUMP"
            fi
          fi
        fi

        # --- Start markers -------------------------------------------------------------
        if [[ "$LINE_PROJ" == *'div class="full-description hide"'* ]]; then
          # The description is taken from this one line, so the body marker
          # and the content are written together.
          echo "Found content start."
          echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
          # The wrapper becomes a plain <div>; YouTube iframes become ordinary
          # links because wkhtmltopdf cannot embed videos.
          CONTENT_LINE=$(echo " $LINE_PROJ" | sed 's/.*<div class="full-description hide">/<div>/' | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/')
          echo $CONTENT_LINE >> "$CUSTOM_HTML"
          echo $CONTENT_LINE >> "$FULLDUMP"
        elif [[ "$LINE_PROJ" == *"widget-wrap widget_document_library"* ]]; then
          IS_DOC_BLOCK=1
          echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
          echo "Found documents start."
        elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
          IS_PHOTO_BLOCK=1
          echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
          echo "Found photos start."
        elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_recent_photos'"* ]]; then
          # NOTE(review): a "recent photos" widget starts the FAQ block here -
          # kept as found, confirm this is intended.
          IS_FAQ_BLOCK=1
          echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
          echo "Found FAQ start."
        elif [[ "$LINE_PROJ" == *"widget-wrap widget_project_team"* ]]; then
          IS_PROFILE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
          echo "Found profile start."
        elif [[ "$LINE_PROJ" == *"<div class='fr-view'>"* ]]; then
          IS_UPDATE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Update -->" >> "$FULLDUMP"
          # NOTE(review): nothing in this script reads $CUSTOM_HTML_UPDATE, so
          # this heading never reaches the PDF - kept as found.
          echo "<h1>Project Updates</h1>" >> "$CUSTOM_HTML_UPDATE"
          echo "Found update start."
        elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_life_cycle'"* ]]; then
          IS_KEYDATES_BLOCK=1
          echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
          echo "Found key dates start."
        elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
          IS_SLIDER_BLOCK=1
          echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
          echo "Found slider start."
        elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
          IS_SINGLE_IMAGE_BLOCK=1
          echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
          echo "Found single image start."
        fi
      done < "$PROJECT_PAGE"

      #cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
      # Show the collected document links in the log. The file only exists
      # when links were found, so check first instead of letting cat fail.
      if [ -e "$CUSTOM_HTML_LINKS" ]; then
        cat "$CUSTOM_HTML_LINKS" # >> "$CUSTOM_HTML"
      fi
      mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"

      # Download every document linked from the document library.
      if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"/documents/"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME=$(_ginv_old_doc_name "$LINE_DOC")
            echo "-------- $DOC_NAME"
            _utils_download_helper "$(echo $LINE_DOC | sed 's/.*href="\([^"]*\)".*/\1/')/download" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_LINKS"
      fi

      # Download the gallery photos collected above.
      if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"ehq-production"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME=$(echo $LINE_DOC | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//')
            echo "======== $DOC_NAME"
            _utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_PHOTOS"
      fi

      # Render the slider images as their own gallery PDF.
      if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
        mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
        cat ./template/default_getinvolved.html > "$WORK_HTML"
        echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> "$WORK_HTML"
        cat "$CUSTOM_HTML_SLIDER" >> "$WORK_HTML"
        echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$WORK_HTML"
        wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
      fi

      # Render the main PDF and keep the raw dump next to it.
      echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$CUSTOM_HTML"
      wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
      cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"

      # Switch every block flag off so an unterminated block cannot leak into
      # the next project, and clear the name so this project runs only once.
      IS_DOC_BLOCK=0
      IS_PHOTO_BLOCK=0
      IS_FAQ_BLOCK=0
      IS_PROFILE_BLOCK=0
      IS_UPDATE_BLOCK=0
      IS_KEYDATES_BLOCK=0
      IS_SLIDER_BLOCK=0
      IS_SINGLE_IMAGE_BLOCK=0
      PROJECT_NAME=""
    fi

    # Second half of a name that was split over two lines.
    if (( NEXT_LINE_CONT_NAME )); then
      PROJECT_NAME=$(_ginv_old_project_name "$LAST_LINE$LINE")
      NEXT_LINE_CONT_NAME=0
      echo "$PROJECT_NAME"
    fi

    if [[ "$LINE" == *"project-tile__meta__name"* ]]; then
      if [[ "$LINE" != *"</span"* ]]; then
        # The closing </span> is on the next line; join the two there.
        NEXT_LINE_CONT_NAME=1
        LAST_LINE=$LINE
      else
        PROJECT_NAME=$(_ginv_old_project_name "$LINE")
        echo "$PROJECT_NAME"
      fi
    fi

    if [[ "$LINE" == *"project-tile__link"* ]]; then
      PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/')
      PROJECT_URL="$SEARCH_URL$PROJECT_URL"
      echo " $PROJECT_URL"
      # Reset project name to mark the start of a new project
      PROJECT_NAME=""
    fi
  done < "$SEARCH_PAGE"
fi

39
SCRAPE_LPS.SH Normal file → Executable file
View File

@ -1,4 +1,4 @@
#!/usr/bin/env bash #!/bin/bash
echo -e "\n-========================================================================-" echo -e "\n-========================================================================-"
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-" echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
@ -8,28 +8,7 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-========================================================================-" echo -e "-========================================================================-"
conv_date() { source ./functions/.functions
echo "$1"
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
}
MEETINGS_PAGE="./tmp.html" MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person! # London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
@ -45,9 +24,9 @@ current_year=$(date +%Y)
current_month=$(date +%m) current_month=$(date +%m)
current_day=$(date +%d) current_day=$(date +%d)
# If I don't set these values then "10#: invalid integer constant" # If I don't set these values then "10#: invalid integer constant"
MEETING_YEAR="0000" ITEM_YEAR="0000"
MEETING_MONTH="00" ITEM_MONTH="00"
MEETING_DAY="00" ITEM_DAY="00"
while IFS= read -r LINE_PRE; do while IFS= read -r LINE_PRE; do
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /') LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
@ -66,11 +45,11 @@ while IFS= read -r LINE_PRE; do
FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">') FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">')
if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then
conv_date "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')" _time_parse_helper "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY" echo "$ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')"
mkdir -p "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/" mkdir -p "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/"
wget "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" -O "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/$ATTACH_TYPE.pdf" -q _utils_download_helper "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/$ATTACH_TYPE.pdf"
fi fi
done < "./tmp/index.html" done < "./tmp/index.html"

69
SCRAPE_LTC.SH Normal file → Executable file
View File

@ -7,6 +7,8 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-========================================================================-" echo -e "-========================================================================-"
source ./functions/.functions
MEETINGS_PAGE="./tmp.html" MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person! # London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87" WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
@ -21,9 +23,9 @@ current_year=$(date +%Y)
current_month=$(date +%m) current_month=$(date +%m)
current_day=$(date +%d) current_day=$(date +%d)
# If I don't set these values then "10#: invalid integer constant" # If I don't set these values then "10#: invalid integer constant"
MEETING_YEAR="0000" ITEM_YEAR="0000"
MEETING_MONTH="00" ITEM_MONTH="00"
MEETING_DAY="00" ITEM_DAY="00"
while IFS= read -r LINE_PRE; do while IFS= read -r LINE_PRE; do
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /') LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
@ -47,33 +49,16 @@ while IFS= read -r LINE_PRE; do
elif [[ "$GREPDATE" != "" ]]; then elif [[ "$GREPDATE" != "" ]]; then
# Remove HTML junk from date string. # Remove HTML junk from date string.
DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//') DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
MEETING_MONTH_WORD=$(echo "$DATES_CLEAN" | sed -E 's/^([A-Za-z]+) .*/\1/')
MEETING_DAY_SHORT=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
MEETING_DAY=$(printf "%02d" ${MEETING_DAY_SHORT#0})
MEETING_YEAR=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
case "$MEETING_MONTH_WORD" in _time_parse_helper "$DATES_CLEAN"
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
echo " NEW MEETING FOUND" echo " NEW MEETING FOUND"
echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY" echo " DATE IS $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
GREPDATE="" GREPDATE=""
else else
# Has a previous meeting been set? What about a date? # Has a previous meeting been set? What about a date?
# Remove comparison to current dates in order to download full page. Adding this for automated LA scripts. # Remove comparison to current dates in order to download full page. Adding this for automated LA scripts.
if [[ "COMMITTEENAME" != "" ]] && [[ "MEETING_YEAR" != "" ]] && (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then if [[ "COMMITTEENAME" != "" ]] && [[ "ITEM_YEAR" != "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
# Not changing meetings, and we know that an old meeting has already been set. Keep going. # Not changing meetings, and we know that an old meeting has already been set. Keep going.
# If match --> make folder --> download # If match --> make folder --> download
@ -85,25 +70,25 @@ while IFS= read -r LINE_PRE; do
# Well... this aged well. # Well... this aged well.
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR" 2> /dev/null mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY" 2> /dev/null mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY" 2> /dev/null
if [[ "$AGENDAURL" != "" ]]; then if [[ "$AGENDAURL" != "" ]]; then
echo " DOWNLOAD AGENDA PDF" echo " DOWNLOAD AGENDA PDF"
echo " $AGENDAURL" echo " $AGENDAURL"
wget --user-agent="$WGET_UA" "$AGENDAURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Agenda.pdf" -c -q #--show-progress _utils_download_helper "$AGENDAURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Agenda.pdf"
elif [[ "$MINUTESURL" != "" ]]; then elif [[ "$MINUTESURL" != "" ]]; then
echo " DOWNLOAD MINUTES PDF" echo " DOWNLOAD MINUTES PDF"
echo " $MINUTESURL" echo " $MINUTESURL"
wget --user-agent="$WGET_UA" "$MINUTESURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Minutes.pdf" -c -q #--show-progress _utils_download_helper "$MINUTESURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Minutes.pdf"
elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then
if [[ "$AGENDAHTMLURL" != "" ]]; then if [[ "$AGENDAHTMLURL" != "" ]]; then
echo " DOWNLOAD AGENDA HTML TO CRAWL" echo " DOWNLOAD AGENDA HTML TO CRAWL"
echo " $AGENDAHTMLURL" echo " $AGENDAHTMLURL"
wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress _utils_download_helper "$AGENDAHTMLURL" "./tmp/work.html"
elif [[ "$MINUTESHTMLURL" != "" ]]; then elif [[ "$MINUTESHTMLURL" != "" ]]; then
echo " DOWNLOAD MINUTES HTML TO CRAWL" echo " DOWNLOAD MINUTES HTML TO CRAWL"
echo " $MINUTESHTMLURL" echo " $MINUTESHTMLURL"
wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress _utils_download_helper "$MINUTESHTMLURL" "./tmp/work.html"
fi fi
while IFS= read -r LINE_HTML_PRE; do while IFS= read -r LINE_HTML_PRE; do
LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /') LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /')
@ -112,25 +97,25 @@ while IFS= read -r LINE_PRE; do
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//') GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
if [[ "$GREPARTICLESTART" != "" ]]; then if [[ "$GREPARTICLESTART" != "" ]]; then
echo " FOUND INDEX ARTICLE START" echo " FOUND INDEX ARTICLE START"
ISARTICLE="TRUE" ISARTICLE=1
elif [[ "$GREPARTICLEEND" != "" ]]; then elif [[ "$GREPARTICLEEND" != "" ]]; then
echo " END OF INDEX ARTICLE" echo " END OF INDEX ARTICLE"
ISARTICLE="" ISARTICLE=0
elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments" 2> /dev/null
ISPDF=$(echo $GREPLINK | grep "\.pdf") ISPDF=$(echo $GREPLINK | grep "\.pdf")
if [[ "$ISPDF" != "" ]]; then if [[ "$ISPDF" != "" ]]; then
PDFNAME=$(echo $ISPDF | sed 's/.*\///') PDFNAME=$(echo $ISPDF | sed 's/.*\///')
echo " DOWNLOAD ATTACHMENT PDF" echo " DOWNLOAD ATTACHMENT PDF"
echo " $ISPDF" echo " $ISPDF"
wget --user-agent="$WGET_UA" "$ISPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFNAME" -c -q #--show-progress _utils_download_helper "$ISPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFNAME"
else else
# Extract title of attachment # Extract title of attachment
ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&amp;/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g') ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&amp;/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g')
echo " DOWNLOAD ATTACHMENT HTML" echo " DOWNLOAD ATTACHMENT HTML"
echo " $ATTACHTITLE" echo " $ATTACHTITLE"
echo " $GREPLINK" echo " $GREPLINK"
wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress _utils_download_helper "$GREPLINK" "./tmp/attachment.html"
while IFS= read -r LINE_ATTACH_PRE; do while IFS= read -r LINE_ATTACH_PRE; do
LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /') LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /')
GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article") GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article")
@ -141,23 +126,23 @@ while IFS= read -r LINE_PRE; do
# CSS for the HTML is in the default template # CSS for the HTML is in the default template
cat ./template/default.html > ./tmp/new.html cat ./template/default.html > ./tmp/new.html
echo "$LINE_ATTACH" >> ./tmp/new.html echo "$LINE_ATTACH" >> ./tmp/new.html
ISATTACHMENTARTICLE="TRUE" ISATTACHMENTARTICLE=1
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
echo " END OF ATTACHMENT ARTICLE" echo " END OF ATTACHMENT ARTICLE"
echo "$LINE_ATTACH" >> ./tmp/new.html echo "$LINE_ATTACH" >> ./tmp/new.html
echo " PROCESSED TO PDF" echo " PROCESSED TO PDF"
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
ISATTACHMENTARTICLE="" ISATTACHMENTARTICLE=0
elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf") ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
if [[ "$ISREFPDF" != "" ]]; then if [[ "$ISREFPDF" != "" ]]; then
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///') PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
echo " DOWNLOAD REFERENCED ATTACHMENT PDF" echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
echo " $GREPATTACHMENTLINK" echo " $GREPATTACHMENTLINK"
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress _utils_download_helper "$ISREFPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFREFNAME"
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
fi fi
elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then elif (( ISATTACHMENTARTICLE )); then
echo "$LINE_ATTACH" >> ./tmp/new.html echo "$LINE_ATTACH" >> ./tmp/new.html
fi fi
LINE_ATTACH="" LINE_ATTACH=""

532
SCRAPE_MEET.SH Normal file → Executable file
View File

@ -1,185 +1,13 @@
#!/usr/bin/env bash #!/usr/bin/env bash
echo -e "\n-========================================================================-" echo -e "\n-========================================================================-"
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-" echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-" echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-========================================================================-" echo -e "-========================================================================-"
# conv_date: split a "Month D, YYYY" date string into numeric parts.
# Arguments: $1 - date text beginning with an alphabetic month word, then a
#                 day number followed by a comma, then the year.
# Outputs:   echoes $1 unchanged to stdout.
# Globals written: MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#                  MEETING_YEAR, MEETING_MONTH.
# NOTE(review): the `source ./functions/.functions` on the opening line looks
# like text from the other column of a side-by-side diff - confirm it belongs
# inside this function.
conv_date() { source ./functions/.functions
echo "$1"
# Leading alphabetic word = month name; the second sed trims spaces and tabs.
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
# Digits between the month word and the comma = day of month.
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
# Zero-pad the day to two digits.
# NOTE(review): printf %d treats a leading-zero value such as 08 or 09 as
# octal and rejects it - confirm the input never carries a padded day.
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
# Digits after the comma = year.
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
# Map the month name (matched by its first three letters, "May" exactly) to a
# two-digit number; anything unrecognised becomes "--".
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
}
#######################################
# Parse a day-first date such as "25 Feb 2026" into zero-padded parts.
# Globals written: MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#                  MEETING_YEAR, MEETING_MONTH
# Arguments: $1 - "<day> <month name> <year>", separated by single spaces
# Outputs:   echoes $1 unchanged to stdout
#######################################
conv_date_alt() {
  echo "$1"

  # First word is the day, second the month name, last word the year.
  local after_day=${1#* }
  MEETING_DAY_SHORT=${1%% *}
  MEETING_MONTH_WORD=${after_day%% *}
  MEETING_YEAR=${1##* }

  # Force base 10 before padding: a bare printf "%02d" rejects "08" and "09"
  # as invalid octal and prints 00 instead of the real day.
  if [[ $MEETING_DAY_SHORT =~ ^[0-9]+ ]]; then
    printf -v MEETING_DAY '%02d' "$((10#${BASH_REMATCH[0]}))"
  else
    # No leading digits at all: keep the historical "00" placeholder.
    MEETING_DAY="00"
  fi

  # Month names are matched by their first three letters ("May" exactly);
  # anything unrecognised becomes "--".
  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
#######################################
# Store one meeting-document URL in the global that matches its title.
# Globals written (one per call, chosen by $1): AGENDA_HTML_URL,
#   AGENDA_PDF_URL, AGENDA_REVISE_HTML_URL, AGENDA_REVISE_PDF_URL,
#   MINUTES_HTML_URL, MINUTES_PDF_URL, MINUTES_ATTACH_PDF_URL,
#   AGENDA_FULL_HTML_URL, AGENDA_FULL_PDF_URL, AGENDA_COVER_HTML_URL,
#   AGENDA_COVER_PDF_URL, AGENDA_POST_HTML_URL, AGENDA_POST_PDF_URL,
#   ADDENDUM_HTML_URL, ADDENDUM_PDF_URL
# Arguments: $1 - document title INCLUDING its surrounding double quotes,
#                 e.g. '"Agenda (PDF)"'
#            $2 - document URL; every double quote in it is removed
# Titles that match none of the known names are ignored.
#######################################
set_agenda_url() {
  local title=$1
  # Strip the quotes with parameter expansion. The previous unquoted
  # `echo $2 | sed` let the shell word-split and glob the URL first, so
  # whitespace was collapsed and ?/* could expand against local files.
  local url=${2//\"/}

  case "$title" in
    '"Agenda (HTML)"') AGENDA_HTML_URL=$url ;;
    '"Agenda (PDF)"') AGENDA_PDF_URL=$url ;;
    '"Revised Agenda (HTML)"') AGENDA_REVISE_HTML_URL=$url ;;
    '"Revised Agenda (PDF)"') AGENDA_REVISE_PDF_URL=$url ;;
    '"Minutes (HTML)"') MINUTES_HTML_URL=$url ;;
    '"Minutes (PDF)"') MINUTES_PDF_URL=$url ;;
    '"Minutes with Attachments (PDF)"') MINUTES_ATTACH_PDF_URL=$url ;;
    '"Agenda Full Package (HTML)"') AGENDA_FULL_HTML_URL=$url ;;
    '"Agenda Full Package (PDF)"') AGENDA_FULL_PDF_URL=$url ;;
    '"Agenda Cover Page (HTML)"') AGENDA_COVER_HTML_URL=$url ;;
    '"Agenda Cover Page (PDF)"') AGENDA_COVER_PDF_URL=$url ;;
    '"Post Agenda (HTML)"') AGENDA_POST_HTML_URL=$url ;;
    '"Post Agenda (PDF)"') AGENDA_POST_PDF_URL=$url ;;
    '"Addendum (HTML)"') ADDENDUM_HTML_URL=$url ;;
    '"Addendum (PDF)"') ADDENDUM_PDF_URL=$url ;;
  esac
}
# Reset every per-meeting document URL global to the empty string so values
# from the previous meeting cannot leak into the next one.
clear_agenda_url() {
  local slot
  for slot in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    # printf -v assigns to the variable whose name is held in $slot.
    printf -v "$slot" '%s' ''
  done
}
#######################################
# Announce and fetch a single document; does nothing when the URL is empty.
# Globals read: WGET_UA
# Arguments: $1 - source URL (may be empty)
#            $2 - destination file path
#            $3 - progress message printed before the download
# Returns:   0 when skipped or downloaded, otherwise wget's exit status
#######################################
_download_agendas_fetch() {
  local url=$1 dest=$2 message=$3
  local status=0

  [[ -n $url ]] || return 0
  echo "$message"
  wget --no-check-certificate --user-agent="$WGET_UA" "$url" -O "$dest" -N -q || status=$?
  if ((status != 0)); then
    # wget runs with -q, so without this a failed download leaves no trace.
    echo "WARNING: wget exited with status $status for $url" >&2
  fi
  return "$status"
}

#######################################
# Download the documents recorded in the *_URL globals for one meeting.
# Agenda: the first available family wins, in this order -
#   regular/revised PDF, regular/revised HTML, full package, post agenda.
# Minutes: PDF variants when any exist, otherwise the HTML version.
# Cover page and addendum: always fetched in every format that has a URL.
# Globals read: WGET_UA and the *_URL variables filled by set_agenda_url
# Arguments: $1 - existing directory that receives the files
#######################################
download_agendas() {
  local dir=$1

  if [[ -n $AGENDA_REVISE_PDF_URL || -n $AGENDA_PDF_URL ]]; then
    _download_agendas_fetch "$AGENDA_REVISE_PDF_URL" "$dir/Agenda_Revised.pdf" \
      "Saving revised agenda as PDF..."
    _download_agendas_fetch "$AGENDA_PDF_URL" "$dir/Agenda.pdf" \
      "Saving regular agenda as PDF..."
  elif [[ -n $AGENDA_REVISE_HTML_URL || -n $AGENDA_HTML_URL ]]; then
    _download_agendas_fetch "$AGENDA_REVISE_HTML_URL" "$dir/Agenda_Revised.html" \
      "Saving revised agenda as HTML... (no PDF found!)"
    _download_agendas_fetch "$AGENDA_HTML_URL" "$dir/Agenda.html" \
      "Saving regular agenda as HTML... (no PDF found!)"
  elif [[ -n $AGENDA_FULL_PDF_URL || -n $AGENDA_FULL_HTML_URL ]]; then
    _download_agendas_fetch "$AGENDA_FULL_PDF_URL" "$dir/Agenda_FullPackage.pdf" \
      "Saving full package agenda as PDF... (no HTML found!)"
    _download_agendas_fetch "$AGENDA_FULL_HTML_URL" "$dir/Agenda_FullPackage.html" \
      "Saving full package agenda as HTML... (no PDF found!)"
  elif [[ -n $AGENDA_POST_PDF_URL || -n $AGENDA_POST_HTML_URL ]]; then
    # This message used to say "as HTML" although a PDF is being saved.
    _download_agendas_fetch "$AGENDA_POST_PDF_URL" "$dir/Agenda_Post.pdf" \
      "Saving post agenda as PDF... (no HTML found!)"
    _download_agendas_fetch "$AGENDA_POST_HTML_URL" "$dir/Agenda_Post.html" \
      "Saving post agenda as HTML... (no PDF found!)"
  fi

  if [[ -n $MINUTES_ATTACH_PDF_URL || -n $MINUTES_PDF_URL ]]; then
    _download_agendas_fetch "$MINUTES_ATTACH_PDF_URL" "$dir/Minutes_With_Attachments.pdf" \
      "Saving minutes with attachments as PDF..."
    _download_agendas_fetch "$MINUTES_PDF_URL" "$dir/Minutes.pdf" \
      "Saving minutes as PDF..."
  else
    _download_agendas_fetch "$MINUTES_HTML_URL" "$dir/Minutes.html" \
      "Saving minutes as HTML... (no PDF found!)"
  fi

  _download_agendas_fetch "$AGENDA_COVER_PDF_URL" "$dir/Agenda_Cover.pdf" \
    "Saving cover agenda as PDF... (no HTML found!)"
  _download_agendas_fetch "$AGENDA_COVER_HTML_URL" "$dir/Agenda_Cover.html" \
    "Saving cover agenda as HTML... (no PDF found!)"
  _download_agendas_fetch "$ADDENDUM_PDF_URL" "$dir/Addendum.pdf" \
    "Saving addendum as PDF... (no HTML found!)"
  _download_agendas_fetch "$ADDENDUM_HTML_URL" "$dir/Addendum.html" \
    "Saving addendum as HTML... (no PDF found!)"
}
# Warning to all who read this script: # Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works. # It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
@ -196,12 +24,12 @@ ADDENDUM_HTML="./tmp/addendum.html"
current_year=$(date +%Y) current_year=$(date +%Y)
current_month=$(date +%m) current_month=$(date +%m)
current_day=$(date +%d)00 current_day=$(date +%d)
SUPPORT_PAST="FALSE" SUPPORT_PAST=""
if [ -d "$TEMP_DIR" ]; then if [ -d "$TEMP_DIR" ]; then
rm -r $TEMP_DIR rm -r $TEMP_DIR
fi fi
rm -f $INDEX_PAGE rm -f $INDEX_PAGE
rm -f $SEARCH_PAGE rm -f $SEARCH_PAGE
@ -209,215 +37,211 @@ rm -f $AGENDA_HTML
mkdir $TEMP_DIR mkdir $TEMP_DIR
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g') INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
INDEX_END="FALSE"
INDEX_END="FALSE" while [[ $INDEX_END == "FALSE" ]]; do
while [[ $INDEX_END == "FALSE" ]]; do echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
echo "SCRAPE_ESCRIBE: Downloading eScribe index..." wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --no-hsts --show-progress
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress if [ $? -ne 8 ]; then
if [ $? -ne 8 ]; then
FOUNDLIST="FALSE" FOUNDLIST="FALSE"
while IFS= read -r LINE; do while IFS= read -r LINE; do
if [[ "TRUE" == $FOUNDLIST ]]; then if [[ "TRUE" == $FOUNDLIST ]]; then
GREPENDLIST=$(echo $LINE | grep '<option ') GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list." echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE" INDEX_END="TRUE"
break break
else else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g') MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
echo "-========================================================================-" echo "-========================================================================-"
echo "- $MEETING_NAME" echo "- $MEETING_NAME"
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //') MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
echo "- Corrected to: $MEETING_NAME" echo "- Corrected to: $MEETING_NAME"
fi fi
# Pages start at 1. Ew. # Pages start at 1. Ew.
x=1 x=1
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json" curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
#cat "${TEMP_DIR}escribe.json" > debug.json #cat "${TEMP_DIR}escribe.json" > debug.json
y=0 y=0
i=0 i=0
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount') NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
while (true); do while (true); do
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' ) NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length')
if [[ "$NUM_IN_JSON" == "" ]]; then if [[ "$NUM_IN_JSON" == "" ]]; then
break break
fi fi
# Decrease in the meeting count == we're on the final page. # Decrease in the meeting count == we're on the final page.
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then if (($i >= $NUM_IN_JSON)) && ((10#$NUM_IN_JSON >= 50)); then
((x++)) ((x++))
i=0 i=0
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json" curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
elif (( $i >= 10#$NUM_IN_JSON )); then elif (($i >= 10#$NUM_IN_JSON)); then
break break
fi fi
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x" echo "$(($i + 1)) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
# Boost speed by extracting a single meeting from the large JSON, then working on the extract. # Boost speed by extracting a single meeting from the large JSON, then working on the extract.
# No need to cat the entire file every time. # No need to cat the entire file every time.
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json" cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' >"${TEMP_DIR}escribe_short.json"
#echo "> Meeting ID" #echo "> Meeting ID"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id' #cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
#echo "> Meeting Attachments" #echo "> Meeting Attachments"
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length') NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script. # Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
clear_agenda_url clear_agenda_url
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do for ((j = 0; j <= (($NUM_ATTACHMENTS - 1)); j++)); do
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')" set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
done done
# "25 Feb 2026" _time_parse_helper "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
echo "Alternate date format."
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
# "Feb 25 2026"
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
echo "Standard date format."
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
else
echo "COULD NOT FIGURE OUT DATE FORMAT!"
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
fi
INPAST="" INPAST=""
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then if ((10#$ITEM_YEAR >= 10#$current_year)) && ((10#$ITEM_MONTH >= $((10#$current_month - 1)))); then
echo "NAME : $MEETING_NAME" echo "NAME : $MEETING_NAME"
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY" echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
echo "A (H) : $AGENDA_HTML_URL" echo "A (H) : $AGENDA_HTML_URL"
echo "A (P) : $AGENDA_PDF_URL" echo "A (P) : $AGENDA_PDF_URL"
echo "AR(H) : $AGENDA_REVISE_HTML_URL" echo "AR(H) : $AGENDA_REVISE_HTML_URL"
echo "AR(P) : $AGENDA_REVISE_PDF_URL" echo "AR(P) : $AGENDA_REVISE_PDF_URL"
echo "AF(H) : $AGENDA_FULL_HTML_URL" echo "AF(H) : $AGENDA_FULL_HTML_URL"
echo "AF(P) : $AGENDA_FULL_PDF_URL" echo "AF(P) : $AGENDA_FULL_PDF_URL"
echo "AC(H) : $AGENDA_COVER_HTML_URL" echo "AC(H) : $AGENDA_COVER_HTML_URL"
echo "AC(P) : $AGENDA_COVER_PDF_URL" echo "AC(P) : $AGENDA_COVER_PDF_URL"
echo "AP(H) : $AGENDA_POST_HTML_URL" echo "AP(H) : $AGENDA_POST_HTML_URL"
echo "AP(P) : $AGENDA_POST_PDF_URL" echo "AP(P) : $AGENDA_POST_PDF_URL"
echo "M (H) : $MINUTES_HTML_URL" echo "M (H) : $MINUTES_HTML_URL"
echo "M (P) : $MINUTES_PDF_URL" echo "M (P) : $MINUTES_PDF_URL"
echo "MA(P) : $MINUTES_ATTACH_PDF_URL" echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
echo "AD(H) : $ADDENDUM_HTML_URL" echo "AD(H) : $ADDENDUM_HTML_URL"
echo "AD(P) : $ADDENDUM_PDF_URL" echo "AD(P) : $ADDENDUM_PDF_URL"
else else
echo "Dates are in the past!" echo "Dates are in the past!"
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY" echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
INPAST="TRUE" INPAST="TRUE"
fi fi
# I think "break" broke when I did nested loops. idk I'm too drunk for this. # I think "break" broke when I did nested loops. idk I'm too drunk for this.
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
echo "Abort." echo "Abort."
break break
fi fi
#echo "> Meeting Video" #echo "> Meeting Video"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo' #cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')" VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
ERROR="FALSE" ERROR="FALSE"
ADDENDUM_ERROR="FALSE" ADDENDUM_ERROR="FALSE"
echo "Downloading agenda HTML..." echo "Downloading agenda HTML..."
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_FULL_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_POST_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
else
ERROR="TRUE"
fi
if [[ $ADDENDUM_HTML_URL != "" ]]; then if [[ -n $AGENDA_REVISE_HTML_URL ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress _utils_download_helper "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
else elif [[ -n $AGENDA_HTML_URL ]]; then
ADDENDUM_ERROR="TRUE" _utils_download_helper "$AGENDA_HTML_URL" "$AGENDA_HTML"
fi
if [[ "$ERROR" == "FALSE" ]]; then elif [[ -n $AGENDA_FULL_HTML_URL ]]; then
_utils_download_helper "$AGENDA_FULL_HTML_URL" "$AGENDA_HTML"
mkdir "./$CITY_ARCHIVE_NAME" elif [[ -n $AGENDA_POST_HTML_URL ]]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings" _utils_download_helper "$AGENDA_POST_HTML_URL" "$AGENDA_HTML"
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then elif [[ -n $AGENDA_COVER_HTML_URL ]]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/" _utils_download_helper "$AGENDA_COVER_HTML_URL" "$AGENDA_HTML"
fi else
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then ERROR="TRUE"
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/" fi
fi
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
if [ ! -d "$MEETING_DIR" ]; then
mkdir "$MEETING_DIR/"
fi
if [ ! -d "$MEETING_DIR/Attachments" ]; then
mkdir "$MEETING_DIR/Attachments/"
fi
if [[ $VIDEO_URL != "" ]]; then if [[ -n $ADDENDUM_HTML_URL ]]; then
echo "Saving recording URL..." _utils_download_helper "$ADDENDUM_HTML_URL" "$ADDENDUM_HTML"
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt" else
fi ADDENDUM_ERROR="TRUE"
fi
# Get attachment links if [[ "$ERROR" == "FALSE" ]]; then
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
# Get attachment names
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
# Get attachment links
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
# Get attachment names
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
fi
# Download attachment and use the name grabbed above
echo "Found the following agenda attachments:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
echo "- $LINEA2"
wget --no-check-certificate --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -N -q #--show-progress
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
echo "All attachments saved."
download_agendas "$MEETING_DIR" mkdir "./$CITY_ARCHIVE_NAME"
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
echo "dir not empty" >> /dev/null mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
else fi
rm -r "$MEETING_DIR/Attachments" if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR" ]; then
fi mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR/"
fi
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$ITEM_YEAR" "$ITEM_MONTH" "$ITEM_DAY")
if [ ! -d "$MEETING_DIR" ]; then
mkdir "$MEETING_DIR/"
fi
if [ ! -d "$MEETING_DIR/Attachments" ]; then
mkdir "$MEETING_DIR/Attachments/"
fi
echo "All files from this meeting have been saved." if [[ $VIDEO_URL != "" ]]; then
fi echo "Saving recording URL..."
echo "https://video.isilive.ca/london/"$VIDEO_URL >"$MEETING_DIR/RecordingLink.txt"
fi
((i++)) # Get attachment links
((y++)) cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
# Get attachment names
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
# Get attachment links
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
# Get attachment names
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
fi
# Download attachment and use the name grabbed above
echo "Found the following agenda attachments:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
echo "- $LINEA2 / $LINEA1"
_utils_download_helper "$INDEX_URL$LINEA1" "$MEETING_DIR/Attachments/$LINEA2"
# [ ! -s "$MEETING_DIR/Attachments/$LINEA2" ] && rm -f "$MEETING_DIR/Attachments/$LINEA2"
done < ./tmp/attachment_urls 3<./tmp/attachment_names
echo "All attachments saved."
download_agendas "$MEETING_DIR"
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
echo "dir not empty" >>/dev/null
else
rm -r "$MEETING_DIR/Attachments"
fi
echo "All files from this meeting have been saved."
find "$MEETING_DIR" -type f -size 0 -delete
echo "Cleaning PDFs for archive.org..."
find "$MEETING_DIR" -type f -name '*.pdf' -print0 | xargs -0 -n1 qpdf --replace-input
# qpdf repairs and leaves garbage original PDFs
find "$MEETING_DIR" -type f -name '*~qpdf-orig' -delete -print
fi
((i++))
((y++))
done done
fi fi
fi fi
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"') GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list." echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE" FOUNDLIST="TRUE"
fi fi
done < $INDEX_PAGE done < $INDEX_PAGE
else else
INDEX_END="TRUE" INDEX_END="TRUE"
echo "SCRAPE_ESCRIBE: Couldn't save index!" echo "SCRAPE_ESCRIBE: Couldn't save index!"
fi fi
done done
done < websites.csv done < websites.csv

47
SCRAPE_MPaS.SH Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
source ./functions/.functions

# London seems to have recently blocked unusual user agents (plain wget is
# refused), so every request presents a desktop-browser User-Agent instead.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_mpas.html"
SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"
ARCHIVE_DIR="./LondonArchive/Master Plans and Strategies"

# -p: the scratch directory usually already exists from a previous run.
mkdir -p "$TEMP_DIR"
rm -f "$SEARCH_PAGE"

wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
# wget exits with status 8 when the server answered with an error response.
if [ $? -ne 8 ]; then
  # Put every tag on its own line so the markup can be scanned line by line.
  sed 's/></>\n</g' "$SEARCH_PAGE" |
    while IFS= read -r LINE; do
      # A <span> directly after a <summary> line carries the section title.
      if ((LAST_LINE_SUMMARY)) && [[ "$LINE" == "<span>"* ]]; then
        CURRENT=$(printf '%s\n' "$LINE" | sed 's/.*<span>\([^<]*\)<\/span>.*/\1/')
        echo "$CURRENT"
      fi
      LAST_LINE_SUMMARY=0
      if [[ "$LINE" == *"<summary>"* ]]; then
        LAST_LINE_SUMMARY=1
        CURRENT=$(printf '%s\n' "$LINE" | sed 's/.*<summary>\([^<]*\)<\/summary>.*/\1/')
        echo "$CURRENT"
      fi
      if [[ "$LINE" == *".pdf"* ]]; then
        # Handle each PDF link separately; one line may hold several hrefs.
        while IFS= read -r HREF; do
          [[ -n "$HREF" ]] || continue
          # Request path: keep the percent-encoding exactly as published,
          # otherwise the URL sent to the server is not the one linked.
          URL_PATH=${HREF#https://london.ca}
          # Local name only: decode spaces and drop encoded apostrophes.
          FILE=$(printf '%s\n' "$URL_PATH" | sed 's/%20/ /g; s/%27//g')
          echo "$FILE"
          mkdir -p "$ARCHIVE_DIR/$CURRENT/"
          _utils_download_helper "https://london.ca$URL_PATH" "$ARCHIVE_DIR/$CURRENT/$(basename "$FILE")"
        done < <(printf '%s\n' "$LINE" | grep -o 'href="[^"]*\.pdf"' | sed 's/^href="//; s/"$//')
      fi
    done
fi

44
SCRAPE_OPEN.SH Normal file → Executable file
View File

@ -8,15 +8,19 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-========================================================================-" echo -e "-========================================================================-"
source ./functions/.functions
WORKDIR="./tmp" WORKDIR="./tmp"
STAGEDIR="./staging" STAGEDIR="./staging"
DOCDIR="./LondonArchive_OpenData" DOCDIR="./LondonArchive/OpenData"
MAPDIR="./LondonArchive_OpenData/Maps" MAPDIR="./LondonArchive/OpenData/Maps"
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87" WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
mkdir $WORKDIR mkdir -p $WORKDIR
mkdir $DOCDIR mkdir -p $DOCDIR
mkdir $MAPDIR mkdir -p $MAPDIR
DOWNLOAD_MAPS=0
i=0 i=0
SEARCH_END=0 SEARCH_END=0
@ -50,18 +54,25 @@ while [[ $SEARCH_END == 0 ]]; do
echo " Cur. article: $i.$j, URL : $ITEM_URL" echo " Cur. article: $i.$j, URL : $ITEM_URL"
echo " Cur. article: $i.$j, Name : $ITEM_NAME" echo " Cur. article: $i.$j, Name : $ITEM_NAME"
rm -rf $STAGEDIR #rm -rf $STAGEDIR
mkdir $STAGEDIR #mkdir $STAGEDIR
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$STAGEDIR/$ITEM_NAME" -c -q _utils_download_helper "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" "$DOCDIR/$ITEM_NAME"
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$DOCDIR/$ITEM_NAME" -c -q
echo " Downloaded." echo " Downloaded."
echo "Compressing." echo "(Not) Compressing."
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR" # No need to compress non-map data.
#7z a "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
fi fi
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]]; then # This section is depracated. Use SCRAPE_AGIS.SH instead.
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]] && (( DOWNLOAD_MAPS )); then
MAPDIR_ITEM=$(echo "$MAPDIR/$ITEM_TITLE")
mkdir -p "$MAPDIR_ITEM"
echo "Item: $MAPDIR_ITEM"
MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')" MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')"
echo " ^^^ Item is map. ($MAP_ID) " echo " ^^^ Item is map. ($MAP_ID) "
# https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1 # https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
@ -71,21 +82,22 @@ while [[ $SEARCH_END == 0 ]]; do
MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1" MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1" MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"
echo " Map URL (CSV) : $MAP_CSV" echo " Map URL (CSV) : $MAP_CSV"
wget --user-agent="$WGET_UA" "$MAP_CSV" -O "$STAGEDIR/$ITEM_TITLE.csv" -c -q _utils_download_helper "$MAP_CSV" "$MAPDIR_ITEM/$ITEM_TITLE.csv"
echo " Downloaded." echo " Downloaded."
echo " Map URL (Shapefile): $MAP_SHP" echo " Map URL (Shapefile): $MAP_SHP"
wget --user-agent="$WGET_UA" "$MAP_SHP" -O "$STAGEDIR/$ITEM_TITLE.shp" -c -q _utils_download_helper "$MAP_SHP" "$MAPDIR_ITEM/$ITEM_TITLE.shp"
echo " Downloaded." echo " Downloaded."
echo " Map URL (GeoJSON) : $MAP_GEO" echo " Map URL (GeoJSON) : $MAP_GEO"
wget --user-agent="$WGET_UA" "$MAP_GEO" -O "$STAGEDIR/$ITEM_TITLE.geojson" -c -q _utils_download_helper "$MAP_GEO" "$MAPDIR_ITEM/$ITEM_TITLE.geojson"
echo " Downloaded." echo " Downloaded."
echo " Map URL (KML) : $MAP_KML" echo " Map URL (KML) : $MAP_KML"
wget --user-agent="$WGET_UA" "$MAP_KML" -O "$STAGEDIR/$ITEM_TITLE.kml" -c -q _utils_download_helper "$MAP_KML" "$MAPDIR_ITEM/$ITEM_TITLE.kml"
echo " Downloaded." echo " Downloaded."
echo ' Source URL is $ITEM_URL.' echo ' Source URL is $ITEM_URL.'
echo "Compressing." echo "Compressing."
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$MAPDIR/$ITEM_TITLE.7z" "$STAGEDIR" rm -f "$MAPDIR_ITEM/$ITEM_TITLE.7z"
7z a "$MAPDIR_ITEM/$ITEM_TITLE.7z" "$MAPDIR_ITEM"
fi fi
done done

78
SCRAPE_PLAN.SH Normal file → Executable file
View File

@ -7,49 +7,7 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-" echo -e "-=- -=-"
echo -e "-========================================================================-" echo -e "-========================================================================-"
conv_date_plan() { source ./functions/.functions
PROJECT_TIME_YEAR=$(echo $1 | sed 's/.*\([0-9]\{4\}\).*/\1/p' | uniq)
PROJECT_TIME_MONTH_WORD=$(echo $1 | sed 's/.*,\s*\([A-Za-z]*\)\s[0-9]\{1,2\},.*/\1/p' | uniq)
PROJECT_TIME_DAY_SHORT=$(echo $1 | sed 's/.*,\s*[A-Za-z]*\s\([0-9]\{1,2\}\),.*/\1/p' | uniq)
PROJECT_TIME_DAY=$(printf "%02d" $PROJECT_TIME_DAY_SHORT)
case "$PROJECT_TIME_MONTH_WORD" in
Jan*) PROJECT_TIME_MONTH="01" ;;
Feb*) PROJECT_TIME_MONTH="02" ;;
Mar*) PROJECT_TIME_MONTH="03" ;;
Apr*) PROJECT_TIME_MONTH="04" ;;
May) PROJECT_TIME_MONTH="05" ;;
Jun*) PROJECT_TIME_MONTH="06" ;;
Jul*) PROJECT_TIME_MONTH="07" ;;
Aug*) PROJECT_TIME_MONTH="08" ;;
Sep*) PROJECT_TIME_MONTH="09" ;;
Oct*) PROJECT_TIME_MONTH="10" ;;
Nov*) PROJECT_TIME_MONTH="11" ;;
Dec*) PROJECT_TIME_MONTH="12" ;;
*) PROJECT_TIME_MONTH="--" ;;
esac
}
conv_date() {
MODIFIED_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MODIFIED_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MODIFIED_DAY=$(printf "%02d" $MODIFIED_DAY_SHORT)
MODIFIED_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MODIFIED_MONTH_WORD" in
Jan*) MODIFIED_MONTH="01" ;;
Feb*) MODIFIED_MONTH="02" ;;
Mar*) MODIFIED_MONTH="03" ;;
Apr*) MODIFIED_MONTH="04" ;;
May) MODIFIED_MONTH="05" ;;
Jun*) MODIFIED_MONTH="06" ;;
Jul*) MODIFIED_MONTH="07" ;;
Aug*) MODIFIED_MONTH="08" ;;
Sep*) MODIFIED_MONTH="09" ;;
Oct*) MODIFIED_MONTH="10" ;;
Nov*) MODIFIED_MONTH="11" ;;
Dec*) MODIFIED_MONTH="12" ;;
*) MODIFIED_MONTH="--" ;;
esac
}
# Warning to all who read this script: # Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works. # It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
@ -81,8 +39,8 @@ mkdir $TEMP_DIR
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications" SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
j=0 j=0
SEARCH_END="FALSE" SEARCH_END=0
while [[ $SEARCH_END == "FALSE" ]]; do while (( ! SEARCH_END )); do
echo "-========================================================================-" echo "-========================================================================-"
echo "Downloading search results... Page $j" echo "Downloading search results... Page $j"
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
@ -107,11 +65,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-') PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
echo " Found project: $PROJECT_NAME" echo " Found project: $PROJECT_NAME"
MODIFIED_MONTH="" ITEM_MONTH=""
MODIFIED_YEAR="" ITEM_YEAR=""
conv_date "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')" _time_parse_helper "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
if (( 10#$MODIFIED_YEAR >= 10#$current_year )) && (( 10#$MODIFIED_MONTH >= $((10#$current_month - 1)) )); then if (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
echo "Last Modified: $MODIFIED_YEAR/$MODIFIED_MONTH/$MODIFIED_DAY" echo "Last Modified: $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
else else
echo "Dates are in the past! Abort." echo "Dates are in the past! Abort."
break break
@ -124,8 +82,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
rm -f $PROJECT_IMAGE_NAMES rm -f $PROJECT_IMAGE_NAMES
while IFS= read -r PLINE; do while IFS= read -r PLINE; do
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then if (( NEXT_LINE_FITEM )); then
NEXT_LINE_FITEM="FALSE" NEXT_LINE_FITEM=0
# Is this line an actual item? # Is this line an actual item?
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items") PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
@ -208,15 +166,15 @@ while [[ $SEARCH_END == "FALSE" ]]; do
fi fi
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime") PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
if [[ $PROJECT_FOUND_TIME != "" ]]; then if [[ $PROJECT_FOUND_TIME != "" ]]; then
conv_date_plan "$PLINE" _time_parse_helper "$(echo $PLINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g' | cut -d, -f2- | cut -d\ -f2-)"
echo "Found date : $PROJECT_TIME_YEAR/$PROJECT_TIME_MONTH/$PROJECT_TIME_DAY" echo "Found date : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
fi fi
fi fi
fi fi
fi fi
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then if (( NEXT_LINE_IMAGE )); then
NEXT_LINE_IMAGE="FALSE" NEXT_LINE_IMAGE=0
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq) PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca") PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
@ -243,14 +201,14 @@ while [[ $SEARCH_END == "FALSE" ]]; do
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label") PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq) PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
NEXT_LINE_FITEM="TRUE" NEXT_LINE_FITEM=1
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol) # Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
# We're setting a flag to let the script know if an upcoming line is contents. # We're setting a flag to let the script know if an upcoming line is contents.
fi fi
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image") PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
NEXT_LINE_IMAGE="TRUE" NEXT_LINE_IMAGE=1
# Same idea as before but for the image shown on the main page. # Same idea as before but for the image shown on the main page.
fi fi
@ -340,11 +298,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
fi fi
done < $SEARCH_PAGE done < $SEARCH_PAGE
else else
SEARCH_END="TRUE" SEARCH_END=1
echo "No more pages!" echo "No more pages!"
fi fi
else else
SEARCH_END="TRUE" SEARCH_END=1
echo "No more pages!" echo "No more pages!"
fi fi
((j++)) ((j++))

9
functions/.functions Normal file
View File

@ -0,0 +1,9 @@
# Directory that holds this file, so the parts load no matter where the
# calling script was started from.
sdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Load order: general helpers (time, utils) first, then the script-specific
# ones (filepro, escribe).
for _functions_part in time utils filepro escribe; do
  source "$sdir/.functions.$_functions_part"
done
unset _functions_part

View File

@ -0,0 +1,133 @@
# Store a document URL in the global variable that matches its label.
#
# Arguments:
#   $1 - document label including its surrounding double quotes,
#        e.g. '"Agenda (HTML)"'
#   $2 - URL, optionally wrapped in double quotes; the quotes are removed
# Globals written: exactly one of the *_URL variables listed below.
# Unknown labels are ignored. Always returns 0.
set_agenda_url() {
  local target
  case "$1" in
    '"Agenda (HTML)"') target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"') target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"') target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"') target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"') target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"') target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"') target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"') target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"') target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"') target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"') target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"') target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"') target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"') target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  # Strip the quotes with parameter expansion rather than an unquoted
  # `echo $2 | sed`: URLs routinely contain `?`, which the shell would
  # otherwise treat as a filename wildcard.
  local url=$2
  url=${url//\"/}
  printf -v "$target" '%s' "$url"
}
# Reset every document-URL global that set_agenda_url can fill to the empty
# string, so values from one meeting never leak into the next.
clear_agenda_url() {
  local slot
  for slot in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$slot" '%s' ''
  done
}
# Save every document of the current meeting whose URL global is set.
#
# Arguments:
#   $1 - directory the files are written into
# Globals read: the *_URL variables filled by set_agenda_url.
# For each document kind the PDF is taken when present; the HTML version is
# only fetched when its PDF is missing. Each step is written as
# "skip-condition || { announce; download; }".
# Returns: the status of the addendum-HTML download when that step runs,
# otherwise 0.
download_agendas() {
  local dest="$1"

  # Revised and regular agenda: both PDFs first, then the HTML stand-ins.
  [[ -z $AGENDA_REVISE_PDF_URL ]] || {
    echo "Saving revised agenda as PDF..."
    _utils_download_helper "$AGENDA_REVISE_PDF_URL" "$dest/Agenda_Revised.pdf"
  }
  [[ -z $AGENDA_PDF_URL ]] || {
    echo "Saving regular agenda as PDF..."
    _utils_download_helper "$AGENDA_PDF_URL" "$dest/Agenda.pdf"
  }
  [[ -n $AGENDA_REVISE_PDF_URL || -z $AGENDA_REVISE_HTML_URL ]] || {
    echo "Saving revised agenda as HTML... (no PDF found!)"
    _utils_download_helper "$AGENDA_REVISE_HTML_URL" "$dest/Agenda_Revised.html"
  }
  [[ -n $AGENDA_PDF_URL || -z $AGENDA_HTML_URL ]] || {
    echo "Saving regular agenda as HTML... (no PDF found!)"
    _utils_download_helper "$AGENDA_HTML_URL" "$dest/Agenda.html"
  }

  # Full package.
  [[ -z $AGENDA_FULL_PDF_URL ]] || {
    echo "Saving full package agenda as PDF... (no HTML found!)"
    _utils_download_helper "$AGENDA_FULL_PDF_URL" "$dest/Agenda_FullPackage.pdf"
  }
  [[ -n $AGENDA_FULL_PDF_URL || -z $AGENDA_FULL_HTML_URL ]] || {
    echo "Saving full package agenda as HTML... (no PDF found!)"
    _utils_download_helper "$AGENDA_FULL_HTML_URL" "$dest/Agenda_FullPackage.html"
  }

  # Post agenda.
  [[ -z $AGENDA_POST_PDF_URL ]] || {
    echo "Saving post agenda as PDF..."
    _utils_download_helper "$AGENDA_POST_PDF_URL" "$dest/Agenda_Post.pdf"
  }
  [[ -n $AGENDA_POST_PDF_URL || -z $AGENDA_POST_HTML_URL ]] || {
    echo "Saving post agenda as HTML... (no PDF found!)"
    _utils_download_helper "$AGENDA_POST_HTML_URL" "$dest/Agenda_Post.html"
  }

  # Minutes: HTML only when neither PDF flavour exists.
  [[ -z $MINUTES_ATTACH_PDF_URL ]] || {
    echo "Saving minutes with attachments as PDF..."
    _utils_download_helper "$MINUTES_ATTACH_PDF_URL" "$dest/Minutes_With_Attachments.pdf"
  }
  [[ -z $MINUTES_PDF_URL ]] || {
    echo "Saving minutes as PDF..."
    _utils_download_helper "$MINUTES_PDF_URL" "$dest/Minutes.pdf"
  }
  [[ -n $MINUTES_ATTACH_PDF_URL || -n $MINUTES_PDF_URL || -z $MINUTES_HTML_URL ]] || {
    echo "Saving minutes as HTML... (no PDF found!)"
    _utils_download_helper "$MINUTES_HTML_URL" "$dest/Minutes.html"
  }

  # Cover page.
  [[ -z $AGENDA_COVER_PDF_URL ]] || {
    echo "Saving cover agenda as PDF... (no HTML found!)"
    _utils_download_helper "$AGENDA_COVER_PDF_URL" "$dest/Agenda_Cover.pdf"
  }
  [[ -n $AGENDA_COVER_PDF_URL || -z $AGENDA_COVER_HTML_URL ]] || {
    echo "Saving cover agenda as HTML... (no PDF found!)"
    _utils_download_helper "$AGENDA_COVER_HTML_URL" "$dest/Agenda_Cover.html"
  }

  # Addendum.
  [[ -z $ADDENDUM_PDF_URL ]] || {
    echo "Saving addendum as PDF... (no HTML found!)"
    _utils_download_helper "$ADDENDUM_PDF_URL" "$dest/Addendum.pdf"
  }
  [[ -n $ADDENDUM_PDF_URL || -z $ADDENDUM_HTML_URL ]] || {
    echo "Saving addendum as HTML... (no PDF found!)"
    _utils_download_helper "$ADDENDUM_HTML_URL" "$dest/Addendum.html"
  }
}

View File

@ -0,0 +1,34 @@
# Recursively mirror one FilePro folder listing.
#
# Arguments:
#   $1 - local directory that receives this folder's documents
#   $2 - URL of the folder's index page
#   $3 - folder id path, used only in log output
# Globals read: WGET_UA (User-Agent for the index request) and START_URL
#   (site prefix for document and sub-folder URLs).
# Index lines are expected to carry data-id, data-title and data-type
# attributes; lines whose data-type is neither "document" nor "folder" are
# ignored.
# Returns: 1 when the index cannot be fetched, otherwise 0. Failures of
#   individual documents or sub-folders do not stop the walk.
# NOTE: calling this without arguments exits the calling shell.
_filepro_download_folder() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <dest dir> <index url> <folder path>"
    exit 1
  fi

  local tmp_dir="$1"
  local index_url="$2"
  local folder_path="$3"
  local tmp_index
  local LINE
  local LINE_ID
  local LINE_TITLE
  local LINE_TYPE

  tmp_index=$(mktemp) || return 1

  if ! wget --no-check-certificate --user-agent="$WGET_UA" "$index_url" -O "$tmp_index" --no-hsts -q; then
    echo "FAILED to fetch folder index: $index_url" >&2
    rm -f -- "$tmp_index"
    return 1
  fi

  echo "Looking in folder $folder_path"
  echo "Download to $tmp_dir/"

  while IFS= read -r LINE || [[ -n "$LINE" ]]; do
    # When an attribute is absent sed leaves the line untouched, so the
    # type test below also filters out unrelated markup.
    LINE_TYPE=$(printf '%s\n' "$LINE" | sed 's/.*data-type="\([^"]*\)".*/\1/g')
    case "$LINE_TYPE" in
      document | folder) ;;
      *) continue ;;
    esac

    LINE_ID=$(printf '%s\n' "$LINE" | sed 's/.*data-id="\([^"]*\)".*/\1/g')
    # Decode the entities seen in titles and squeeze whitespace runs to one
    # space, which keeps names identical to those of earlier runs.
    LINE_TITLE=$(printf '%s\n' "$LINE" \
      | sed -e 's/.*data-title="\([^"]*\)".*/\1/g' \
        -e 's/&amp;/\&/g' \
        -e "s/&#039;/'/g" \
        -e "s/&#39;/'/g" \
        -e 's/[[:space:]]\{1,\}/ /g')
    # A slash in a title would otherwise be read as a directory separator.
    LINE_TITLE=${LINE_TITLE//\//-}

    if [[ "$LINE_TYPE" == "document" ]]; then
      echo "Found document: $LINE_ID : $LINE_TITLE.pdf... downloading..."
      mkdir -p "$tmp_dir"
      _utils_download_helper "${START_URL}/document/$LINE_ID" "$tmp_dir/$LINE_TITLE.pdf"
    else
      _filepro_download_folder "$tmp_dir/$LINE_TITLE" "${START_URL}/filepro/documents/$LINE_ID" "$folder_path/$LINE_ID"
    fi
  done < "$tmp_index"

  rm -f -- "$tmp_index"
}

71
functions/.functions.time Normal file
View File

@ -0,0 +1,71 @@
# Work out which of the two supported date layouts $1 uses and hand the
# untouched argument to the matching parser.
#
# Arguments:
#   $1 - date text, either "<day> <month> <year>" or "<month> <day> <year>"
# Outputs: echoes the argument; prints a diagnostic when no layout matches.
# Returns: the selected parser's status, or 1 when the layout is unknown.
# NOTE: calling this without arguments exits the calling shell.
_time_parse_helper() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date>"
    exit 1
  fi
  echo $1

  # Layout detection works on a copy with double quotes and the first comma
  # removed. The unquoted echo is deliberate here: it trims the ends and
  # squeezes whitespace runs, which the anchored patterns below rely on.
  local probe
  probe=$(echo $1)
  probe=${probe//\"/}
  probe=${probe/,/}

  local day_first_re='^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$'
  local month_first_re='^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$'

  if [[ $probe =~ $day_first_re ]]; then
    _time_parse_ddmonyyyy "$1"
    return
  fi
  if [[ $probe =~ $month_first_re ]]; then
    _time_parse_monddyyyy "$1"
    return
  fi

  echo "COULD NOT FIGURE OUT DATE FORMAT!"
  return 1
}
# Parse a "<Month> <day>, <year>" date such as "June 8, 2026".
#
# Arguments:
#   $1 - the date text; the comma after the day is optional
# Globals written:
#   ITEM_MONTH_WORD - month name as found in the input
#   ITEM_MONTH      - two-digit month, or "--" when the name is unknown
#   ITEM_DAY_SHORT  - day as found in the input
#   ITEM_DAY        - two-digit day, "00" when no numeric day was found
#   ITEM_YEAR       - year as found in the input
# NOTE: calling this without arguments exits the calling shell.
_time_parse_monddyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in mon dd yyyy>"
    exit 1
  fi

  ITEM_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  # ",?" - the format detection in this file also lets comma-less dates
  # through, so the comma must not be required here.
  ITEM_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),?.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  ITEM_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+,? ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')

  # Force base 10: printf would read a zero-padded "08" or "09" as an
  # invalid octal number and print "00".
  if [[ $ITEM_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v ITEM_DAY '%02d' "$((10#$ITEM_DAY_SHORT))"
  else
    ITEM_DAY="00"
  fi

  case "$ITEM_MONTH_WORD" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}
# Parse a "<day> <Month> <year>" date such as "8 June 2026".
#
# Arguments:
#   $1 - the date text; double quotes and commas are ignored, and fields may
#        be separated by any amount of whitespace
# Globals written:
#   ITEM_DAY_SHORT  - first field (the day as found in the input)
#   ITEM_MONTH_WORD - second field (the month name)
#   ITEM_MONTH      - two-digit month, or "--" when the name is unknown
#   ITEM_DAY        - two-digit day, "00" when the first field is not numeric
#   ITEM_YEAR       - last field
# NOTE: calling this without arguments exits the calling shell.
_time_parse_ddmonyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in dd mon yyyy>"
    exit 1
  fi

  # The format detection in this file strips quotes and a comma before
  # matching but passes the raw text on, so the same cleanup is needed here;
  # otherwise a quoted date yields a day of '"19' and a year of '2026"'.
  local cleaned=${1//\"/}
  cleaned=${cleaned//,/}

  local -a fields=()
  read -r -a fields <<<"$cleaned"
  local count=${#fields[@]}

  ITEM_DAY_SHORT=${fields[0]:-}
  ITEM_MONTH_WORD=${fields[1]:-}
  ITEM_YEAR=""
  if ((count > 0)); then
    ITEM_YEAR=${fields[count - 1]}
  fi

  # Force base 10: printf would read a zero-padded "08" or "09" as an
  # invalid octal number and print "00".
  if [[ $ITEM_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v ITEM_DAY '%02d' "$((10#$ITEM_DAY_SHORT))"
  else
    ITEM_DAY="00"
  fi

  case "$ITEM_MONTH_WORD" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}

104
functions/.functions.utils Normal file
View File

@ -0,0 +1,104 @@
# Produce a text-searchable copy of a PDF.
#
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
# A PDF whose first five pages already reference fonts is treated as OCRed
# and copied unchanged. Otherwise each page is rendered to an image, OCRed
# on its own and the results are joined into $2.
# Globals written: MYFONTS (font names reported by pdffonts).
# Returns: 0 on success, 1 when a required step fails. Unlike before, the
#   function returns instead of exiting, so the calling script keeps
#   running after an already-OCRed file.
# NOTE: calling this without arguments still exits the calling shell.
_utils_ocrmypdf() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <in.pdf> <out.pdf>"
    exit 1
  fi

  local tmp
  local status

  # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
  MYFONTS=$(pdffonts -l 5 "$1" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
  if [ "$MYFONTS" = '' ] || [ "$MYFONTS" = '[none]' ]; then
    echo "NOT OCRed yet. Working..."
  else
    echo "$1 is already OCRed. Saving as is."
    cp "$1" "$2" || return 1
    return 0
  fi

  tmp=$(mktemp -d) || return 1
  _utils_ocrmypdf_pages "$1" "$2" "$tmp"
  status=$?
  # The scratch directory holds a 300 dpi image per page; always remove it.
  rm -rf -- "$tmp"
  return "$status"
}

# Private worker for _utils_ocrmypdf: OCR every page of $1 inside scratch
# directory $3 and join the per-page results into $2.
_utils_ocrmypdf_pages() {
  local in="$1"
  local out="$2"
  local tmp="$3"
  local page
  local img
  local rotation
  local page_no=0
  local -a ocr_pages=()

  pdfseparate "$in" "$tmp/page-%04d.pdf" || return 1

  for page in "$tmp"/page-*.pdf; do
    img="$tmp/img-$page_no.png"
    # Set the page rotation to 0 before rendering.
    qpdf --replace-input --rotate=0:1-z "$page"
    pdftoppm -singlefile -r 300 -png -cropbox "$page" "$tmp/img-$page_no" || return 1

    # Checks rotations. Annoying way to do it but whatever.
    rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
    case "$rotation" in
      90 | 180 | 270) convert "$img" -rotate "$rotation" "$img" ;;
    esac

    ocrmypdf \
      --skip-text \
      --clean \
      --optimize 1 \
      --jobs 1 \
      "$img" "$tmp/ocr-$page_no-tmp.pdf" || return 1

    case "$rotation" in
      90 | 270)
        # NOTE(review): 270 is applied for both detected angles, exactly as
        # before; confirm that is intended for pages detected at 270.
        # If qpdf fails the unrotated OCR result is kept instead.
        if qpdf "$tmp/ocr-$page_no-tmp.pdf" "$tmp/ocr-$page_no.pdf" --rotate=270:1-z; then
          rm -f "$tmp/ocr-$page_no-tmp.pdf"
        else
          mv "$tmp/ocr-$page_no-tmp.pdf" "$tmp/ocr-$page_no.pdf"
        fi
        ;;
      *)
        mv "$tmp/ocr-$page_no-tmp.pdf" "$tmp/ocr-$page_no.pdf"
        ;;
    esac

    # Collected in page order, so the listing never has to be re-sorted.
    ocr_pages+=("$tmp/ocr-$page_no.pdf")
    page_no=$((page_no + 1))
  done

  pdfunite "${ocr_pages[@]}" "$out" || return 1
}
# Normalize a (possibly percent-encoded) string and print the result on
# stdout without a trailing newline.
#
# Arguments:
#   $1 - the string to clean
# Requires perl with the URI::Escape and Unicode::Normalize modules.
# NOTE: calling this without arguments exits the calling shell.
_utils_fix_dashes() {
if [ "$#" -eq 0 ]; then
echo "Usage: <input string>"
exit 1
fi
# Steps performed by the Perl program, in order:
#   1. percent-decode repeatedly until the text stops changing
#   2. apply Unicode NFKC normalization
#   3. map the listed Unicode hyphen/dash/minus characters to "-"
#   4. replace every "&" with "and"
#   5. map curly single quotes to ' and curly double quotes to "
#   6. map no-break spaces to spaces, delete zero-width characters and BOMs
#   7. squeeze whitespace runs to one space and trim both ends
#   8. remove whitespace that directly precedes a trailing ".ext" suffix
perl -CSDA -MURI::Escape -MUnicode::Normalize -e '
binmode STDOUT, ":utf8";
my $s = shift // "";
my $prev;
do { $prev = $s; $s = uri_unescape($s); } while ($s ne $prev);
$s = NFKC($s);
$s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
$s =~ s/&/and/g;
$s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
$s =~ tr/\x{201C}\x{201D}/"/;
$s =~ tr/\x{00A0}/ /;
$s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
$s =~ s/\s+/ /g;
$s =~ s/^\s+|\s+$//g;
$s =~ s/\s+(\.[^. ]+)$/$1/;
print $s;
' "$1"
}
# Download a URL to a file with curl, skipping unchanged files.
#
# Arguments:
#   $1 - URL to fetch
#   $2 - output file
# Globals: WGET_UA is used as the User-Agent. It is given the default
#   browser string only when the caller has not set one, so a caller's own
#   value is no longer overwritten.
# Outputs: a one-line status on stdout; failures are reported on stderr.
# Returns: 0 for HTTP 200 and 304, 1 otherwise. On failure the output file
#   is removed so that no partial or stale file is left for the caller.
# NOTE: calling this without arguments exits the calling shell.
_utils_download_helper() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <url> <outfile>"
    exit 1
  fi

  : "${WGET_UA:=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87}"

  local url="$1"
  local out="$2"
  local code
  local -a time_cond=()

  # Only ask for "newer than the local copy" when a non-empty copy exists;
  # a missing or zero-byte file must always be fetched.
  if [[ -s "$out" ]]; then
    time_cond=(-z "$out")
  fi

  code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 "${time_cond[@]}" -o "$out" "$url")
  case "$code" in
    200)
      echo "Downloaded."
      ;;
    304)
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      rm -f -- "$out"
      return 1
      ;;
  esac
}

78
template/default.html Normal file
View File

@ -0,0 +1,78 @@
<meta charset="UTF-8">
<style>
/*
 * Styles for template/default.html, the template page the README says the
 * HTML --> PDF conversion needs.
 * NOTE(review): the class selectors below (.h1-.h6, .post-teaser,
 * .table-responsive) presumably mirror classes in the converted pages'
 * markup -- confirm against a real page.
 */

/* Page body: centred column, 90% wide but never narrower than 600px. */
body {
width: 90%;
min-width: 600px;
position: relative;
margin-left: auto;
margin-right: auto;
color: #666;
font-size: 16px;
font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 300;
}
strong {
font-weight: 700;
}
/* Paragraph text is black rather than the grey body default. */
p {
color: #000000;
}
/* Headings (and heading-like classes): bold, tight line height, blue. */
.h1, .h2, .h3, .h4, .h5, .h6, .post-teaser.featured .post-title, h1, h2, h3, h4, h5, h6 {
font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 700;
line-height: 1.1;
color: #087ac0;
}
/* Tables: at least full width, collapsed borders, light grey rules. */
table {
display: table;
margin-bottom: 2em;
min-width: 100%;
border-spacing: 0;
border-collapse: collapse;
border-color: #ccc;
background-color: transparent;
line-height: 1.5;
}
.table-responsive {
overflow-x: auto;
}
table tbody {
display: table-row-group;
vertical-align: top;
border-color: inherit;
}
/* Zebra striping: odd body rows (1st, 3rd, ...) get a grey background. */
table tbody > tr:nth-of-type(2n+1) {
background-color: #f2f2f2;
}
table tr {
display: table-row;
vertical-align: inherit;
border-color: inherit;
}
/* Cell padding and 1px grid lines for body and header cells. */
table tbody > tr > td, table tbody > tr > th, table thead > tr > th {
padding: 8px;
border: 1px solid #ccc;
vertical-align: top;
}
table td {
display: table-cell;
border: 1px solid #ccc;
}
/* Lists: no top margin, 12px below the list and below each item. */
ol, ul {
margin-top: 0;
margin-bottom: 12px;
}
/* NOTE(review): second rule for the same "ol, ul" selector; it only adds
   box-sizing and could be merged with the rule above. */
ol, ul {
box-sizing: border-box;
}
ol li {
padding-left: 10px;
}
ol li, ul li {
padding-bottom: 12px;
}
/* Common line height for running text elements. */
address, dd, dt, li, p {
line-height: 1.5;
}
</style>

View File

@ -0,0 +1,83 @@
<meta charset="UTF-8">
<style>
/*
 * Styles for an HTML page template.
 * NOTE(review): presumably used for HTML --> PDF conversion like
 * template/default.html, with page-break and image rules added -- confirm.
 * NOTE(review): the class selectors below (.h1-.h6, .post-teaser,
 * .table-responsive) presumably mirror classes in the converted pages'
 * markup -- confirm against a real page.
 */

/* Ask the renderer not to split these elements across a page break. */
td, h1, h2, h3, p, b, div, i, span, label, ul, li, tr, table { page-break-inside: avoid; }
/* Page body: centred column, 90% wide but never narrower than 600px. */
body {
width: 90%;
min-width: 600px;
position: relative;
margin-left: auto;
margin-right: auto;
color: #666;
font-size: 16px;
font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 300;
}
strong {
font-weight: 700;
}
/* Paragraph text is black rather than the grey body default. */
p {
color: #000000;
}
/* Headings (and heading-like classes): bold, tight line height, blue. */
.h1, .h2, .h3, .h4, .h5, .h6, .post-teaser.featured .post-title, h1, h2, h3, h4, h5, h6 {
font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 700;
line-height: 1.1;
color: #087ac0;
}
/* Tables: at least full width, collapsed borders, light grey rules. */
table {
display: table;
margin-bottom: 2em;
min-width: 100%;
border-spacing: 0;
border-collapse: collapse;
border-color: #ccc;
background-color: transparent;
line-height: 1.5;
}
.table-responsive {
overflow-x: auto;
}
table tbody {
display: table-row-group;
vertical-align: top;
border-color: inherit;
}
/* Zebra striping: odd body rows (1st, 3rd, ...) get a grey background. */
table tbody > tr:nth-of-type(2n+1) {
background-color: #f2f2f2;
}
table tr {
display: table-row;
vertical-align: inherit;
border-color: inherit;
}
/* Cell padding and 1px grid lines for body and header cells. */
table tbody > tr > td, table tbody > tr > th, table thead > tr > th {
padding: 8px;
border: 1px solid #ccc;
vertical-align: top;
}
table td {
display: table-cell;
border: 1px solid #ccc;
}
/* Lists: no top margin, 12px below the list and below each item. */
ol, ul {
margin-top: 0;
margin-bottom: 12px;
}
/* NOTE(review): second rule for the same "ol, ul" selector; it only adds
   box-sizing and could be merged with the rule above. */
ol, ul {
box-sizing: border-box;
}
ol li {
padding-left: 10px;
}
ol li, ul li {
padding-bottom: 12px;
}
/* Common line height for running text elements. */
address, dd, dt, li, p {
line-height: 1.5;
}
/* Images never exceed their container width; height follows the aspect
   ratio. !important is there to win over other declarations --
   NOTE(review): presumably inline sizes in the source HTML; confirm. */
img {
max-width: 100% !important;
height: auto !important;
}
</style>

35
websites.csv Normal file → Executable file
View File

@ -1,34 +1,3 @@
"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", "" "https://pub-london.escribemeetings.com/", "LondonArchive", ""
"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", "" "https://pub-stthomas.escribemeetings.com/", "StThomasArchive", ""
"https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", "" "https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""

1 https://pub-brampton.escribemeetings.com/ https://pub-london.escribemeetings.com/ SubBramptonArchive LondonArchive
2 https://pub-markham.escribemeetings.com/ https://pub-stthomas.escribemeetings.com/ SubMarkhamArchive StThomasArchive
https://pub-cityofkingston.escribemeetings.com/ SubKingstonArchive
https://pub-barrie.escribemeetings.com/ SubBarrieArchive
https://pub-oshawa.escribemeetings.com/ SubOshawaArchive
3 https://pub-ottawa.escribemeetings.com/ https://pub-ottawa.escribemeetings.com/ OttawaArchive OttawaArchive
https://pub-owensound.escribemeetings.com/ SubOwenSoundArchive
https://pub-goderich.escribemeetings.com/ SubGoderichArchive
https://pub-oakville.escribemeetings.com/ SubOakvilleArchive
https://burlingtonpublishing.escribemeetings.com/ SubBurlingtonArchive
https://pub-milton.escribemeetings.com/ SubMiltonArchive
https://pub-durhamregion.escribemeetings.com/ SubDurhamArchive
https://pub-richmondhill.escribemeetings.com/ SubRichmondHillArchive
https://pub-whitby.escribemeetings.com/ SubWhitbyArchive
https://pub-london.escribemeetings.com/ LondonArchive London Meetings
https://pub-middlesexcounty.escribemeetings.com/ SubMiddlesexCountyArchive
https://pub-lucanbiddulph.escribemeetings.com/ SubLucanBiddulphArchive
https://pub-thamescentre.escribemeetings.com/ SubThamesCentreArchive
https://pub-stthomas.escribemeetings.com/ SubStThomasArchive
https://pub-northmiddlesex.escribemeetings.com/ SubNorthMiddlesexArchive
https://pub-strathroy-caradoc.escribemeetings.com/ SubStrathroyCaradocArchive
https://pub-adelaidemetcalfe.escribemeetings.com/ SubAdelaideMetcalfeArchive
https://pub-middlesexcentre.escribemeetings.com/ SubMiddsexCentreArchive
https://pub-mississauga.escribemeetings.com/ SubMississaugaArchive
https://pub-guelph.escribemeetings.com/ SubGuelphArchive
https://pub-regionofwaterloo.escribemeetings.com/ SubWaterlooArchive
https://pub-kitchener.escribemeetings.com/ SubKitchenerArchive
https://pub-hamilton.escribemeetings.com/ SubHamiltonArchive
https://pub-brantford.escribemeetings.com/ SubBrantfordArchive
https://pub-woodstock.escribemeetings.com/ SubWoodstockArchive
https://pub-stratford.escribemeetings.com/ SubStratfordArchive
https://pub-chatham-kent.escribemeetings.com/ SubChathamKentArchive
https://pub-cambridge.escribemeetings.com/ SubCambridgeArchive
https://pub-vaughan.escribemeetings.com/ SubVaughanArchive