Separate functions

This commit is contained in:
Lillian Skinner 2026-06-19 23:30:51 -04:00
parent 3bce46e582
commit 16c4905b41
No known key found for this signature in database
GPG Key ID: 17F0E72D2C98B0A6
20 changed files with 1531 additions and 545 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
LondonArchive/
LondonScrapers_privdata/
tmp/
staging/

4
README.MD Normal file → Executable file
View File

@ -17,7 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
## Scrape eScribe meetings (SCRAPE_MEET.SH)
This bash script will scrape meetings from the eScribe meetings platform.
This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST=1` (true), meetings older than 2 months will be downloaded. Otherwise, they will be skipped.
The basic structure of the output files is:
```
@ -76,7 +76,7 @@ The basic structure of the output files is:
This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/
Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low.
Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. The HTML --> PDF conversion needs the template page included at `./template/default.html`.
The basic structure of the output files is:
```

66
SCRAPE_AGIS.SH Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/env bash
# SCRAPE_AGIS.SH
# Walks the ArcGIS REST services directory at $ARGIS_URL and archives the
# layers of every MapServer service it lists under LondonArchive/ArcGIS/.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_AGIS.SH: Downloads ArcGIS maps -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
# Shared helpers; this script calls _utils_download_helper from it.
source ./functions/.functions

# Browser-like user agent for HTTP requests.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
ARGIS_URL="https://maps.london.ca/server/rest/services"

# Scratch space: $TMP holds the JSON responses, $TMP_STAGING holds the files
# of the layer currently being downloaded.
TMP="./tmp"
TMP_STAGING="./tmp/layers"
SERVICELIST_JSON="$TMP/servicelist.json"
FOLDER_JSON="$TMP/folder.json"
SERVICE_JSON="$TMP/service.json"
LAYERQUERY_JSON="$TMP/layer_query.json"

# -p: the scratch directories usually survive from an earlier run, and a
# plain mkdir would print an error for each of them every time.
mkdir -p "$TMP" "$TMP_STAGING"
# Page size used for every layer query.
# NOTE(review): if a service advertises a maxRecordCount below this value the
# server presumably truncates each page, and stepping the offset by
# MAX_REQUESTS would then skip records — confirm against the live services.
MAX_REQUESTS=2000

# Top-level catalogue: lists the folders to walk.
# NOTE(review): services listed at the catalogue root (outside any folder) are
# not visited by this loop.
wget "$ARGIS_URL?f=json" --user-agent="$WGET_UA" -O "$SERVICELIST_JSON" -q
jq -r '.folders[]?' "$SERVICELIST_JSON" | while read -r FOLDER; do
  wget "$ARGIS_URL/$FOLDER?f=json" --user-agent="$WGET_UA" -O "$FOLDER_JSON" -q
  echo "Looking in $FOLDER"

  # Only MapServer services are archived; other service types are ignored.
  jq -r '.services[]
    | select(.type=="MapServer")
    | .name' "$FOLDER_JSON" | while read -r SERVICE; do
    echo "Found $SERVICE"
    # Not referenced again in this script; kept in case a sourced helper
    # reads it.
    SERVICE_PATH="$FOLDER/$SERVICE"
    echo "$ARGIS_URL/$SERVICE/MapServer"
    wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q
    mkdir -p "LondonArchive/ArcGIS/${SERVICE}"

    # One "id|name" record per layer of the service.
    jq -r '.layers[]? | "\(.id)|\(.name)"' "$SERVICE_JSON" | while IFS='|' read -r LAYERID LAYERNAME; do
      # Start every layer with an empty staging directory. ${VAR:?} aborts
      # instead of running rm on an empty path.
      rm -rf -- "${TMP_STAGING:?}"
      mkdir -p "$TMP_STAGING"

      # Make the layer name usable as a file name: slashes and backslashes
      # become spaces, whitespace runs collapse, both ends are trimmed.
      # Quoted on purpose: the previous unquoted echo glob-expanded names
      # containing * or ?.
      LAYERNAME_CLEAN=$(printf '%s\n' "$LAYERNAME" \
        | sed -E -e 's/\// /g' -e 's/\\/ /g' -e 's/[[:space:]]+/ /g' -e 's/^ //' -e 's/ $//')

      # Ask the server how many features the layer holds, using the same
      # browser user agent as the other requests.
      curl -s -A "$WGET_UA" "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&returnCountOnly=true&f=json" -o "$TMP/count.json"
      ITEM_COUNT=$(jq -r '.count // 0' "$TMP/count.json")
      # An error page or malformed reply must not break the arithmetic below.
      if ! [[ "$ITEM_COUNT" =~ ^[0-9]+$ ]]; then
        ITEM_COUNT=0
      fi

      i=0
      j=0
      # Always fetch the first page (as before, even for a zero or unknown
      # count), then keep paging only while records remain. The previous
      # `i <= ITEM_COUNT` test requested one extra, empty page whenever the
      # count was an exact multiple of the page size.
      while (( i == 0 || i < ITEM_COUNT )); do
        echo "Downloading $LAYERID-${j} $LAYERNAME_CLEAN"
        echo "$i of $ITEM_COUNT"
        _utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=geojson" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.geojson"
        echo "Done GeoJSON!"
        _utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=kmz" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.kmz"
        echo "Done KMZ!"
        i=$(( i + MAX_REQUESTS ))
        j=$(( j + 1 ))
      done

      # Bundle every page of this layer into a single archive.
      7z a "LondonArchive/ArcGIS/${SERVICE}/Layer ${LAYERID} - ${LAYERNAME_CLEAN}.7z" "$TMP_STAGING"
    done
  done
done

14
SCRAPE_ESCRIBE.SH Normal file → Executable file
View File

@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
INDEX_END="FALSE"
while [[ $INDEX_END == "FALSE" ]]; do
INDEX_END=0
while (( ! INDEX_END )); do
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
if [ $? -ne 8 ]; then
FOUNDLIST="FALSE"
FOUNDLIST=0
while IFS= read -r LINE; do
if [[ "TRUE" == $FOUNDLIST ]]; then
if (( FOUNDLIST )); then
GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE"
INDEX_END=1
break
else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
@ -88,11 +88,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE"
FOUNDLIST=1
fi
done < $INDEX_PAGE
else
INDEX_END="TRUE"
INDEX_END=1
echo "SCRAPE_ESCRIBE: Couldn't save index!"
fi
done

329
SCRAPE_GINV.SH Executable file
View File

@ -0,0 +1,329 @@
#!/usr/bin/env bash
# SCRAPE_GINV.SH
# Scrapes the project pages listed at https://getinvolved.london.ca/projects
# into ./LondonArchive/GetInvolved/.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
# The banner used to carry the name and description of SCRAPE_MPaS.SH.
echo -e "-=- SCRAPE_GINV.SH: Scrape Get Involved London projects -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
# Shared helpers; this script calls _utils_fix_dashes, _utils_download_helper
# and _time_parse_monddyyyy from it.
source ./functions/.functions
# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently blocked unusual user agents. Can't use plain
# wget or even ping. Thankfully, we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files, all under ./tmp/.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_TIMELINE="./tmp/custom_timeline_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

# Today's date, compared against each project's "updated" date.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# Drop any index left over from the previous run.
rm -f -- "$SEARCH_PAGE"
# -p: ./tmp normally already exists, and a plain mkdir would print an error.
mkdir -p "$TEMP_DIR"
SEARCH_URL="https://getinvolved.london.ca/projects"
#######################################
# Extract project metadata from the opening tag of a project card on the
# index page.
# Globals:   PROJECT_NAME, PROJECT_CATS, PROJECT_LOCATION (written)
# Arguments: $1 - HTML text of the card tag (two source lines joined when the
#                 tag was split across lines)
# Notes:     when an attribute is missing, the corresponding sed expression
#            does not match and the whole (cleaned) tag text is stored.
#######################################
set_metadata() {
  local card
  local IFS=$' \t\n'
  local -a words=()

  # Decode the entities that occur in the attributes and drop quote entities
  # and square brackets.
  card=$(printf '%s\n' "$1" \
    | sed -e 's/&amp;/\&/g' -e 's/&quot;//g' -e "s/&#039;/'/g" -e 's/\[//g' -e 's/\]//g')

  # Normalise whitespace the way the former unquoted `echo $tmp` did, but
  # without letting the shell glob-expand characters such as * in the text.
  read -r -a words <<< "$card"

  # The name becomes a directory name, so apostrophes and colons are removed.
  # NOTE(review): two further stages, `sed 's///g'`, sat in this pipeline. An
  # empty regex with no previous regex makes sed abort ("no previous regular
  # expression"), which emptied PROJECT_NAME, so they are left out. Presumably
  # they once held non-printing characters — re-add them as explicit escapes
  # if those characters still need stripping.
  PROJECT_NAME=$(_utils_fix_dashes "$(printf '%s\n' "${words[*]}" \
    | sed -e 's/.*data-project-name="\([^"]*\)".*/\1/' -e "s/'//g" -e 's/://g')")
  PROJECT_CATS=$(printf '%s\n' "$card" | sed 's/.*data-project-category="\([^"]*\)".*/\1/')
  PROJECT_LOCATION=$(printf '%s\n' "$card" | sed 's/.*data-project-location="\([^"]*\)".*/\1/')
}
# Print $1 with whitespace runs collapsed to single spaces and both ends
# trimmed — what the former unquoted `echo $var` produced — but without
# pathname expansion, so a * in scraped HTML is never replaced by file names.
_ginv_squeeze() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  printf '%s\n' "${words[*]}"
}

# Append the whitespace-normalised text $1 as one line to every file named in
# the remaining arguments.
_ginv_append() {
  local text target
  text=$(_ginv_squeeze "$1")
  shift
  for target in "$@"; do
    printf '%s\n' "$text" >> "$target"
  done
}

# Succeed when the project-page line $1 is a heading or site boilerplate
# (legal notices, login/register links) that should not be archived.
# Filters for "Share your feedback" and "Subscribe for project updates" were
# deliberately left disabled in the original list and stay disabled.
_ginv_is_boilerplate() {
  local phrase
  for phrase in \
    "</h1>" \
    "City of London Land Acknowledgement" \
    "Ongoing Site Specific Planning Applications" \
    "This site is owned and operated by the City of London using software licensed from Social Pinpoint" \
    "Social Pinpoint has been commissioned by City of London (Canada) to collect and display user content on their behalf" \
    "Notice of Collection of Personal Information" \
    'href="/register"' \
    'href="/login"' \
    "Users have the right to access, correct, or delete their personal information" \
    "This privacy policy may change from time to time" \
    "Notice of Collection" \
    "Ready to have your say?"; do
    if [[ "$1" == *"$phrase"* ]]; then
      return 0
    fi
  done
  return 1
}

# Fetch the project index. Only wget exit status 8 (the server issued an error
# response) is treated as failure; any other status falls through to parsing.
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
if [ $? -ne 8 ]; then
  while IFS= read -r LINE; do
    # A project is handled on the line after its "dt-updated" date was parsed,
    # and only when it was updated this month or last month. Months are
    # compared as year*12+month so that December still counts as "last month"
    # in January (the separate year/month tests rejected it).
    if (( FOUND_DATE )) && [[ "$LAST_LINE" == "" ]] &&
      (( 10#$ITEM_YEAR * 12 + 10#$ITEM_MONTH >= 10#$current_year * 12 + 10#$current_month - 1 )); then
      FOUND_DATE=0
      echo "$PROJECT_URL"
      echo "$PROJECT_NAME"
      wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

      # Now we can work on the actual project page.
      # A page that ends in the middle of a block must not leave its flag set
      # for the next project, so every parser flag starts cleared.
      NEXT_LINE_CONTENT=0
      FIRST_CONTENT=0
      IS_DOC_BLOCK=0
      IS_PHOTO_BLOCK=0
      IS_FAQ_BLOCK=0
      IS_PROFILE_BLOCK=0
      IS_TIMELINE_BLOCK=0
      IS_KEYDATES_BLOCK=0
      IS_SLIDER_BLOCK=0
      IS_SINGLE_IMAGE_BLOCK=0
      rm -f -- "$CUSTOM_HTML_LINKS" "$CUSTOM_HTML_PHOTOS" "$CUSTOM_HTML_FAQ" \
        "$CUSTOM_HTML_PROFILE" "$CUSTOM_HTML_TIMELINE" "$CUSTOM_HTML_KEYDATES" \
        "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
      cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
      echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

      while IFS= read -r LINE_PROJ; do
        # --- Body content -------------------------------------------------
        if (( NEXT_LINE_CONTENT )); then
          # Next hive-block marks end of current item
          if [[ "$LINE_PROJ" == *"hive-block"* ]] || [[ "$LINE_PROJ" == "" ]]; then
            NEXT_LINE_CONTENT=0
            echo "End of current content."
          elif ! _ginv_is_boilerplate "$LINE_PROJ"; then
            # The section marker is written only once real content shows up.
            if (( FIRST_CONTENT )); then
              echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
              FIRST_CONTENT=0
            fi
            # Replace a YouTube iframe with a normal <a href=""> link.
            # wkhtmltopdf obviously can't embed YouTube videos.
            CONTENT_LINE=$(printf '%s\n' "$LINE_PROJ" \
              | sed -e 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' \
                -e 's/<iframe/<a/' \
                -e 's/<\/iframe>/YouTube Link<\/a><\/br>/')
            _ginv_append "$CONTENT_LINE" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Document library ---------------------------------------------
        if (( IS_DOC_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_DOC_BLOCK=0
            # Keep the link list only if it holds at least one download link.
            if ! grep -q "/download_file/" "$CUSTOM_HTML_LINKS" 2>/dev/null; then
              rm -f -- "$CUSTOM_HTML_LINKS"
            fi
            echo "End of current documents."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML_LINKS" "$FULLDUMP"
          fi
        fi

        # --- Photo gallery ------------------------------------------------
        if (( IS_PHOTO_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- end foreach -->"* ]]; then
            IS_PHOTO_BLOCK=0
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              rm -f -- "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current photos."
          elif [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
            # Only the photo URL is recorded.
            PHOTO_URL=$(_ginv_squeeze "$LINE_PROJ" | sed 's/.*href="\([^"]*\)".*/\1/')
            _ginv_append "$PHOTO_URL" "$CUSTOM_HTML_PHOTOS" "$FULLDUMP"
          fi
        fi

        # --- FAQ ----------------------------------------------------------
        if (( IS_FAQ_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_FAQ_BLOCK=0
            echo "End of current FAQ."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
            # I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
            if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
              # Question links become headings; this line goes to the PDF
              # source only, not to the dump.
              FAQ_HEADING=$(_ginv_squeeze "$LINE_PROJ" | sed -e 's/<a role/<h3 role/g' -e 's/<\/a>/<\/h3>/g')
              _ginv_append "$FAQ_HEADING" "$CUSTOM_HTML"
            elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
              _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
            fi
          fi
        fi

        # --- Team profile -------------------------------------------------
        if (( IS_PROFILE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<script>"* ]]; then
            IS_PROFILE_BLOCK=0
            echo "End of current profile."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Timeline -----------------------------------------------------
        if (( IS_TIMELINE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"btn-unfill btn-primary"* ]]; then
            IS_TIMELINE_BLOCK=0
            echo "End of current timeline."
          elif [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Key dates ----------------------------------------------------
        if (( IS_KEYDATES_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_KEYDATES_BLOCK=0
            echo "End of current key dates."
          elif [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Image slider -------------------------------------------------
        if (( IS_SLIDER_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
            IS_SLIDER_BLOCK=0
            # This message used to say "key dates" (copy/paste slip).
            echo "End of current slider."
          elif [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
          fi
        fi

        # --- Single image -------------------------------------------------
        if (( IS_SINGLE_IMAGE_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_SINGLE_IMAGE_BLOCK=0
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              rm -f -- "$CUSTOM_HTML_PHOTOS"
            else
              cat "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current single image."
          elif [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Block starts -------------------------------------------------
        # Checked last so the opening line itself is not treated as content.
        if [[ "$LINE_PROJ" == *"hive-block hive-block-content ljs"* ]]; then
          NEXT_LINE_CONTENT=1
          FIRST_CONTENT=1
          # We'll write the LA comment inside of the content block.
          # There we can ensure that the comment is only written if content does exist.
          echo "Found content start."
        elif [[ "$LINE_PROJ" == *"docLibModal hive-block-document-library"* ]]; then
          IS_DOC_BLOCK=1
          echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
          echo "Found documents start."
        elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
          IS_PHOTO_BLOCK=1
          echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
          echo "Found photos start."
        elif [[ "$LINE_PROJ" == *"hive-modal faqModal hive-block-faq"* ]]; then
          IS_FAQ_BLOCK=1
          echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
          echo "Found FAQ start."
        elif [[ "$LINE_PROJ" == *"hive-block-bio hive-block"* ]]; then
          IS_PROFILE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
          echo "Found profile start."
        elif [[ "$LINE_PROJ" == *"hive-block-timeline hive-block"* ]]; then
          IS_TIMELINE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Timeline -->" >> "$FULLDUMP"
          echo "Found timeline start."
        elif [[ "$LINE_PROJ" == *"hive-modal dateModal"* ]]; then
          IS_KEYDATES_BLOCK=1
          echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
          echo "Found key dates start."
        elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
          IS_SLIDER_BLOCK=1
          echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
          echo "Found slider start."
        elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
          IS_SINGLE_IMAGE_BLOCK=1
          echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
          echo "Found single image start."
        fi
      done < "$PROJECT_PAGE"

      #cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
      #cat "$CUSTOM_HTML_LINKS" >> "$CUSTOM_HTML"
      mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"

      # Download every document linked from the document library.
      if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"download_file"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_URL=$(_ginv_squeeze "$LINE_DOC" | sed 's/.*href="\([^"]*\)".*/\1/')
            # The real file name is only visible in the redirect target.
            # Headers end in CR, may be written in either case, and -L can
            # print several of them; the last Location is the final file.
            # Stripping the CR here replaces the old `s/\.pdf./.pdf/` stage,
            # which handled it for PDF names only.
            DOC_NAME=$(curl -s -L -I -A "$WGET_UA" "$DOC_URL" \
              | tr -d '\r' \
              | grep -i '^location:' \
              | tail -n 1 \
              | sed -e 's/^[Ll]ocation: *//' -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
            if [[ -z "$DOC_NAME" ]]; then
              echo "Could not determine a file name for $DOC_URL; skipping." >&2
              continue
            fi
            echo "$DOC_NAME"
            _utils_download_helper "$DOC_URL" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_LINKS"
      fi

      # Download every collected photo.
      if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"amazonaws"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME=$(_ginv_squeeze "$LINE_DOC" | sed -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
            echo "$DOC_NAME"
            _utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_PHOTOS"
      fi

      # Render the slider images as a separate gallery PDF.
      if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
        mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
        cat ./template/default_getinvolved.html > "$WORK_HTML"
        echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> "$WORK_HTML"
        cat "$CUSTOM_HTML_SLIDER" >> "$WORK_HTML"
        echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$WORK_HTML"
        wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
      fi

      echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$CUSTOM_HTML"
      wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
      # The dump only exists when at least one section was found.
      if [[ -e "$FULLDUMP" ]]; then
        cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
      fi
    fi

    # --- Index page parsing ---------------------------------------------
    # The line after a project card carries the project link.
    if (( NEXT_LINE_URL )); then
      NEXT_LINE_URL=0
      PROJECT_URL=$(_ginv_squeeze "$LINE" | sed 's/.*href="\([^"]*\)".*/\1/')
    fi
    if [[ "$LAST_LINE" != "" ]]; then
      # Second half of a card tag that was split over two lines.
      set_metadata "$LAST_LINE$LINE"
      LAST_LINE=""
      NEXT_LINE_URL=1
    elif [[ "$LINE" == *"h-entry project card"* ]] && [[ "$LINE" == *"data-project-name"* ]] && [[ "$LINE" != *"<%-"* ]]; then
      #echo $LINE
      if [[ "$LINE" != *"data-project-category"* ]]; then
        # Sometimes lines are split, so we'll combine the pieces over time.
        LAST_LINE=$LINE
        echo "Line is split!"
      else
        LAST_LINE=""
        set_metadata "$LINE"
        NEXT_LINE_URL=1
      fi
    elif [[ "$LINE" == *'time class="dt-updated"'* ]]; then
      PROJECT_DATE=$(_ginv_squeeze "$LINE" | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g')
      echo "$PROJECT_DATE"
      # Expected to fill ITEM_YEAR / ITEM_MONTH / ITEM_DAY (defined in
      # ./functions/.functions).
      _time_parse_monddyyyy "$PROJECT_DATE"
      echo "$ITEM_YEAR$ITEM_MONTH$ITEM_DAY"
      FOUND_DATE=1
    fi
  done < "$SEARCH_PAGE"
fi

301
SCRAPE_GINV_OLD.SH Executable file
View File

@ -0,0 +1,301 @@
#!/usr/bin/env bash
# SCRAPE_GINV_OLD.SH
# Scrapes the project pages of the Get Involved London site hosted at
# https://getinvolvedlondon.ca.engagementhq.com into
# ./LondonArchive/GetInvolved/.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
# The banner used to carry the name and description of SCRAPE_MPaS.SH.
echo -e "-=- SCRAPE_GINV_OLD.SH: Scrape Get Involved London (EngagementHQ) -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
# Shared helpers; this script calls _utils_fix_dashes and
# _utils_download_helper from it.
source ./functions/.functions
# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently blocked unusual user agents. Can't use plain
# wget or even ping. Thankfully, we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files, all under ./tmp/.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_UPDATE="./tmp/custom_update_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

# Drop any index left over from the previous run.
rm -f -- "$SEARCH_PAGE"
# -p: ./tmp normally already exists, and a plain mkdir would print an error.
mkdir -p "$TEMP_DIR"
# Site root; project links found on the index are appended to it.
SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"
# Print $1 with whitespace runs collapsed to single spaces and both ends
# trimmed — what the former unquoted `echo $var` produced — but without
# pathname expansion, so a * in scraped HTML is never replaced by file names.
_ginv_squeeze() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  printf '%s\n' "${words[*]}"
}

# Append the whitespace-normalised text $1 as one line to every file named in
# the remaining arguments.
_ginv_append() {
  local text target
  text=$(_ginv_squeeze "$1")
  shift
  for target in "$@"; do
    printf '%s\n' "$text" >> "$target"
  done
}

# Print the project name found in the <span> of tile HTML $1, reduced to
# something usable as a directory name: slashes and backslashes become
# " and ", apostrophes, colons, (R) signs and quote entities are removed,
# "&amp;" becomes "and", and surrounding spaces are trimmed.
# Both call sites (split and unsplit tiles) now share this one rule set; the
# split case previously lacked the "&#39;" removal.
# The trim used to be `sed 's/^ +| +$//g'`, which in basic regex syntax
# matches a literal "+" and "|" and therefore never trimmed anything.
# NOTE(review): two stages, `sed 's///g'`, sat in this pipeline. An empty
# regex with no previous regex makes sed abort ("no previous regular
# expression"), which emptied the name, so they are left out. Presumably they
# once held non-printing characters — re-add them as explicit escapes if those
# characters still need stripping.
_ginv_clean_name() {
  _ginv_squeeze "$1" | sed \
    -e 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' \
    -e 's/\// and /g' \
    -e 's/\\/ and /g' \
    -e "s/'//g" \
    -e 's/://g' \
    -e 's/®//g' \
    -e 's/&quot;//g' \
    -e 's/&amp;/and/g' \
    -e 's/&#39;//g' \
    -e 's/amp;//g' \
    -e 's/^ *//' \
    -e 's/ *$//'
}

# Fetch the project index. Only wget exit status 8 (the server issued an error
# response) is treated as failure; any other status falls through to parsing.
wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
if [ $? -ne 8 ]; then
  while IFS= read -r LINE; do
    # A non-empty PROJECT_NAME means the previous line(s) completed a project
    # tile (link first, then name): scrape that project now.
    if [[ "$PROJECT_NAME" != "" ]]; then
      echo "$PROJECT_URL"
      echo "$PROJECT_NAME"
      wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

      # Now we can work on the actual project page.
      rm -f -- "$CUSTOM_HTML_LINKS" "$CUSTOM_HTML_PHOTOS" "$CUSTOM_HTML_FAQ" \
        "$CUSTOM_HTML_PROFILE" "$CUSTOM_HTML_UPDATE" "$CUSTOM_HTML_KEYDATES" \
        "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
      cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
      echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

      while IFS= read -r LINE_PROJ; do
        # --- Document library ---------------------------------------------
        if (( IS_DOC_BLOCK )); then
          if [[ "$LINE_PROJ" == "</ul>" ]]; then
            IS_DOC_BLOCK=0
            # Keep the link list only if it holds at least one document link.
            if ! grep -q "/documents/" "$CUSTOM_HTML_LINKS" 2>/dev/null; then
              rm -f -- "$CUSTOM_HTML_LINKS"
            fi
            echo "End of current documents."
          elif [[ "$LINE_PROJ" == *"a data-url"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML_LINKS" "$FULLDUMP"
            _ginv_squeeze "$LINE_PROJ"
          fi
        fi

        # --- Photo gallery ------------------------------------------------
        if (( IS_PHOTO_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_PHOTO_BLOCK=0
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              rm -f -- "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current photos."
          elif [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
            # Only the photo URL is recorded.
            PHOTO_URL=$(_ginv_squeeze "$LINE_PROJ" | sed 's/.*href="\([^"]*\)".*/\1/')
            _ginv_append "$PHOTO_URL" "$CUSTOM_HTML_PHOTOS" "$FULLDUMP"
          fi
        fi

        # --- FAQ ----------------------------------------------------------
        if (( IS_FAQ_BLOCK )); then
          if [[ "$LINE_PROJ" == *"div class='clearfix'></div"* ]]; then
            IS_FAQ_BLOCK=0
            echo "End of current FAQ."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
            # I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
            if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
              # Question links become headings; this line goes to the PDF
              # source only, not to the dump.
              FAQ_HEADING=$(_ginv_squeeze "$LINE_PROJ" | sed -e 's/<a role/<h3 role/g' -e 's/<\/a>/<\/h3>/g')
              _ginv_append "$FAQ_HEADING" "$CUSTOM_HTML"
            elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
              _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
            fi
          fi
        fi

        # --- Team profile -------------------------------------------------
        if (( IS_PROFILE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!--[if IE]>"* ]]; then
            IS_PROFILE_BLOCK=0
            echo "End of current profile."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Project updates ----------------------------------------------
        if (( IS_UPDATE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<div class='clearfix'></div>"* ]]; then
            IS_UPDATE_BLOCK=0
            echo "End of current update."
          elif [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Key dates ----------------------------------------------------
        if (( IS_KEYDATES_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_KEYDATES_BLOCK=0
            echo "End of current key dates."
          elif [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Image slider -------------------------------------------------
        if (( IS_SLIDER_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
            IS_SLIDER_BLOCK=0
            # This message used to say "key dates" (copy/paste slip).
            echo "End of current slider."
          elif [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
          fi
        fi

        # --- Single image -------------------------------------------------
        if (( IS_SINGLE_IMAGE_BLOCK )); then
          if [[ "$LINE_PROJ" == "" ]]; then
            IS_SINGLE_IMAGE_BLOCK=0
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2>/dev/null; then
              rm -f -- "$CUSTOM_HTML_PHOTOS"
            else
              cat "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current single image."
          elif [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- Block starts -------------------------------------------------
        if [[ "$LINE_PROJ" == *'div class="full-description hide"'* ]]; then
          # The whole description sits on this one line, so the section
          # marker and the content are written together. (The former
          # FIRST_CONTENT flag was set and tested back to back, i.e. always
          # true.)
          echo "Found content start."
          echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
          # Strip the wrapper div and turn a YouTube iframe into a plain
          # link; wkhtmltopdf can't embed videos.
          CONTENT_LINE=$(printf '%s\n' "$LINE_PROJ" \
            | sed -e 's/.*<div class="full-description hide">/<div>/' \
              -e 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' \
              -e 's/<iframe/<a/' \
              -e 's/<\/iframe>/YouTube Link<\/a><\/br>/')
          _ginv_append "$CONTENT_LINE" "$CUSTOM_HTML" "$FULLDUMP"
        elif [[ "$LINE_PROJ" == *"widget-wrap widget_document_library"* ]]; then
          IS_DOC_BLOCK=1
          echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
          echo "Found documents start."
        elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
          IS_PHOTO_BLOCK=1
          echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
          echo "Found photos start."
        elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_recent_photos'"* ]]; then
          # NOTE(review): a "recent photos" widget opens the FAQ block here —
          # confirm this is the intended marker.
          IS_FAQ_BLOCK=1
          echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
          echo "Found FAQ start."
        elif [[ "$LINE_PROJ" == *"widget-wrap widget_project_team"* ]]; then
          IS_PROFILE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
          echo "Found profile start."
        elif [[ "$LINE_PROJ" == *"<div class='fr-view'>"* ]]; then
          IS_UPDATE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Update -->" >> "$FULLDUMP"
          # NOTE(review): this heading goes to $CUSTOM_HTML_UPDATE, which
          # nothing in this script reads back, while the update text itself
          # goes to $CUSTOM_HTML — confirm which file was intended.
          echo "<h1>Project Updates</h1>" >> "$CUSTOM_HTML_UPDATE"
          echo "Found update start."
        elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_life_cycle'"* ]]; then
          IS_KEYDATES_BLOCK=1
          echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
          echo "Found key dates start."
        elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
          IS_SLIDER_BLOCK=1
          echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
          echo "Found slider start."
        elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
          IS_SINGLE_IMAGE_BLOCK=1
          echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
          echo "Found single image start."
        fi
      done < "$PROJECT_PAGE"

      #cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
      # Show the collected links; the file is absent when no document was
      # found.
      if [[ -e "$CUSTOM_HTML_LINKS" ]]; then
        cat "$CUSTOM_HTML_LINKS" # >> "$CUSTOM_HTML"
      fi
      mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"

      # Download every document; the link text becomes the file name.
      if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"/documents/"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME="$(_ginv_squeeze "$LINE_DOC" \
              | sed -e 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/g' -e 's/ (pdf)//' -e 's/^ *//' -e 's/ *$//').pdf"
            echo "-------- $DOC_NAME"
            DOC_URL=$(_ginv_squeeze "$LINE_DOC" | sed 's/.*href="\([^"]*\)".*/\1/')
            _utils_download_helper "$DOC_URL/download" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_LINKS"
      fi

      # Download every collected photo.
      if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"ehq-production"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME=$(_ginv_squeeze "$LINE_DOC" | sed -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
            echo "======== $DOC_NAME"
            _utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_PHOTOS"
      fi

      # Render the slider images as a separate gallery PDF.
      if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
        mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
        cat ./template/default_getinvolved.html > "$WORK_HTML"
        echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> "$WORK_HTML"
        cat "$CUSTOM_HTML_SLIDER" >> "$WORK_HTML"
        echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$WORK_HTML"
        wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
      fi

      echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$CUSTOM_HTML"
      wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
      # The dump only exists when at least one section was found.
      if [[ -e "$FULLDUMP" ]]; then
        cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
      fi

      # Clear the parser state so nothing leaks into the next project.
      IS_DOC_BLOCK=0
      IS_PHOTO_BLOCK=0
      IS_FAQ_BLOCK=0
      IS_PROFILE_BLOCK=0
      IS_UPDATE_BLOCK=0
      IS_KEYDATES_BLOCK=0
      IS_SLIDER_BLOCK=0
      IS_SINGLE_IMAGE_BLOCK=0
      PROJECT_NAME=""
    fi

    # --- Index page parsing ---------------------------------------------
    # Second half of a name <span> that was split over two lines.
    if (( NEXT_LINE_CONT_NAME )); then
      PROJECT_NAME=$(_utils_fix_dashes "$(_ginv_clean_name "$LAST_LINE$LINE")")
      NEXT_LINE_CONT_NAME=0
      echo "$PROJECT_NAME"
    fi
    if [[ "$LINE" == *"project-tile__meta__name"* ]]; then
      if [[ "$LINE" != *"</span"* ]]; then
        # The closing tag is on the next line; finish the name there.
        NEXT_LINE_CONT_NAME=1
        LAST_LINE=$LINE
      else
        PROJECT_NAME=$(_utils_fix_dashes "$(_ginv_clean_name "$LINE")")
        echo "$PROJECT_NAME"
      fi
    fi
    if [[ "$LINE" == *"project-tile__link"* ]]; then
      # Links on the index are site-relative.
      PROJECT_URL=$(_ginv_squeeze "$LINE" | sed 's/.*href="\([^"]*\)".*/\1/')
      PROJECT_URL="$SEARCH_URL$PROJECT_URL"
      echo " $PROJECT_URL"
      # Reset project name to mark the start of a new project
      PROJECT_NAME=""
    fi
  done < "$SEARCH_PAGE"
fi

39
SCRAPE_LPS.SH Normal file → Executable file
View File

@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
@ -8,28 +8,7 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-"
echo -e "-========================================================================-"
conv_date() {
echo "$1"
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
}
source ./functions/.functions
MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents. Can't use plain wget or even ping. Thankfully, we can pretend to be a real person!
@ -45,9 +24,9 @@ current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
# If I don't set these values then "10#: invalid integer constant"
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
ITEM_YEAR="0000"
ITEM_MONTH="00"
ITEM_DAY="00"
while IFS= read -r LINE_PRE; do
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
@ -66,11 +45,11 @@ while IFS= read -r LINE_PRE; do
FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">')
if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then
conv_date "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
_time_parse_helper "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
echo "$ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')"
mkdir -p "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/"
wget "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" -O "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/$ATTACH_TYPE.pdf" -q
mkdir -p "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/"
_utils_download_helper "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/$ATTACH_TYPE.pdf"
fi
done < "./tmp/index.html"

69
SCRAPE_LTC.SH Normal file → Executable file
View File

@ -7,6 +7,8 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-"
echo -e "-========================================================================-"
source ./functions/.functions
MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents. Can't use plain wget or even ping. Thankfully, we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
@ -21,9 +23,9 @@ current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
# If I don't set these values then "10#: invalid integer constant"
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
ITEM_YEAR="0000"
ITEM_MONTH="00"
ITEM_DAY="00"
while IFS= read -r LINE_PRE; do
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
@ -47,33 +49,16 @@ while IFS= read -r LINE_PRE; do
elif [[ "$GREPDATE" != "" ]]; then
# Remove HTML junk from date string.
DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
MEETING_MONTH_WORD=$(echo "$DATES_CLEAN" | sed -E 's/^([A-Za-z]+) .*/\1/')
MEETING_DAY_SHORT=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
MEETING_DAY=$(printf "%02d" ${MEETING_DAY_SHORT#0})
MEETING_YEAR=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
_time_parse_helper "$DATES_CLEAN"
echo " NEW MEETING FOUND"
echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
echo " DATE IS $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
GREPDATE=""
else
# Has a previous meeting been set? What about a date?
# Remove comparison to current dates in order to download full page. Adding this for automated LA scripts.
if [[ "COMMITTEENAME" != "" ]] && [[ "MEETING_YEAR" != "" ]] && (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
if [[ "COMMITTEENAME" != "" ]] && [[ "ITEM_YEAR" != "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
# Not changing meetings, and we know that an old meeting has already been set. Keep going.
# If match --> make folder --> download
@ -85,25 +70,25 @@ while IFS= read -r LINE_PRE; do
# Well... this aged well.
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY" 2> /dev/null
if [[ "$AGENDAURL" != "" ]]; then
echo " DOWNLOAD AGENDA PDF"
echo " $AGENDAURL"
wget --user-agent="$WGET_UA" "$AGENDAURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Agenda.pdf" -c -q #--show-progress
_utils_download_helper "$AGENDAURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Agenda.pdf"
elif [[ "$MINUTESURL" != "" ]]; then
echo " DOWNLOAD MINUTES PDF"
echo " $MINUTESURL"
wget --user-agent="$WGET_UA" "$MINUTESURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Minutes.pdf" -c -q #--show-progress
_utils_download_helper "$MINUTESURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Minutes.pdf"
elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then
if [[ "$AGENDAHTMLURL" != "" ]]; then
echo " DOWNLOAD AGENDA HTML TO CRAWL"
echo " $AGENDAHTMLURL"
wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress
_utils_download_helper "$AGENDAHTMLURL" "./tmp/work.html"
elif [[ "$MINUTESHTMLURL" != "" ]]; then
echo " DOWNLOAD MINUTES HTML TO CRAWL"
echo " $MINUTESHTMLURL"
wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress
_utils_download_helper "$MINUTESHTMLURL" "./tmp/work.html"
fi
while IFS= read -r LINE_HTML_PRE; do
LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /')
@ -112,25 +97,25 @@ while IFS= read -r LINE_PRE; do
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
if [[ "$GREPARTICLESTART" != "" ]]; then
echo " FOUND INDEX ARTICLE START"
ISARTICLE="TRUE"
ISARTICLE=1
elif [[ "$GREPARTICLEEND" != "" ]]; then
echo " END OF INDEX ARTICLE"
ISARTICLE=""
elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
ISARTICLE=0
elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments" 2> /dev/null
ISPDF=$(echo $GREPLINK | grep "\.pdf")
if [[ "$ISPDF" != "" ]]; then
PDFNAME=$(echo $ISPDF | sed 's/.*\///')
echo " DOWNLOAD ATTACHMENT PDF"
echo " $ISPDF"
wget --user-agent="$WGET_UA" "$ISPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFNAME" -c -q #--show-progress
_utils_download_helper "$ISPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFNAME"
else
# Extract title of attachment
ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&amp;/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g')
echo " DOWNLOAD ATTACHMENT HTML"
echo " $ATTACHTITLE"
echo " $GREPLINK"
wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress
_utils_download_helper "$GREPLINK" "./tmp/attachment.html"
while IFS= read -r LINE_ATTACH_PRE; do
LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /')
GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article")
@ -141,23 +126,23 @@ while IFS= read -r LINE_PRE; do
# CSS for the HTML is in the default template
cat ./template/default.html > ./tmp/new.html
echo "$LINE_ATTACH" >> ./tmp/new.html
ISATTACHMENTARTICLE="TRUE"
ISATTACHMENTARTICLE=1
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
echo " END OF ATTACHMENT ARTICLE"
echo "$LINE_ATTACH" >> ./tmp/new.html
echo " PROCESSED TO PDF"
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
ISATTACHMENTARTICLE=""
elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
ISATTACHMENTARTICLE=0
elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
if [[ "$ISREFPDF" != "" ]]; then
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
echo " $GREPATTACHMENTLINK"
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
_utils_download_helper "$ISREFPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFREFNAME"
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
fi
elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then
elif (( ISATTACHMENTARTICLE )); then
echo "$LINE_ATTACH" >> ./tmp/new.html
fi
LINE_ATTACH=""

532
SCRAPE_MEET.SH Normal file → Executable file
View File

@ -1,185 +1,13 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
#######################################
# Parse a "Month D, YYYY" date (e.g. "Feb 25, 2026"; the comma is optional)
# into zero-padded numeric parts.
# Globals (written): MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#                    MEETING_MONTH, MEETING_YEAR
# Arguments: $1 - date string, month name first
# Outputs:   echoes the raw input string to stdout
#######################################
conv_date() {
  echo "$1"
  MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),?.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+,? +([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')

  # A leading zero makes printf read the day as octal, so "08" and "09" are
  # rejected and come out as "00". Force base 10 before padding.
  if [[ "$MEETING_DAY_SHORT" =~ ^[0-9]+$ ]]; then
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  else
    # The day could not be extracted; use the same placeholder the callers
    # initialise their date variables with.
    MEETING_DAY="00"
  fi

  # "May" is matched exactly; every other month accepts any abbreviation
  # that starts with its first three letters.
  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
#######################################
# Parse a day-first "D Month YYYY" date (e.g. "25 Feb 2026") into
# zero-padded numeric parts.
# Globals (written): MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#                    MEETING_MONTH, MEETING_YEAR
# Arguments: $1 - date string, day first, fields separated by spaces
# Outputs:   echoes the raw input string to stdout
#######################################
conv_date_alt() {
  echo "$1"
  # Second space-separated field.
  MEETING_MONTH_WORD=$(echo "$1" | sed 's/^[^ ]* //' | sed 's/ .*//')
  # Everything before the first space.
  MEETING_DAY_SHORT=$(echo "$1" | sed 's/ .*//')
  # Everything after the last space.
  MEETING_YEAR=$(echo "$1" | sed 's/.* //')

  # A leading zero makes printf read the day as octal, so "08" and "09" are
  # rejected and come out as "00". Force base 10 before padding.
  if [[ "$MEETING_DAY_SHORT" =~ ^[0-9]+$ ]]; then
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  else
    # The day could not be extracted; use the same placeholder the callers
    # initialise their date variables with.
    MEETING_DAY="00"
  fi

  # "May" is matched exactly; every other month accepts any abbreviation
  # that starts with its first three letters.
  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
#######################################
# Store a meeting-link URL in the global that corresponds to its title.
# Both arguments are expected in the double-quoted form this function's
# case labels use (e.g. '"Agenda (PDF)"'); double quotes are stripped from
# the URL before it is stored. Titles that are not recognised are ignored.
# Globals (written): one of the *_URL variables named in the case below
# Arguments: $1 - link title, including its surrounding double quotes
#            $2 - link URL, possibly containing double quotes
#######################################
set_agenda_url() {
  local target
  case "$1" in
    '"Agenda (HTML)"') target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"') target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"') target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"') target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"') target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"') target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"') target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"') target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"') target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"') target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"') target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"') target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"') target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"') target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  # Strip the quotes with parameter expansion rather than an unquoted
  # `echo $2 | sed`: URLs routinely contain `?` (and may contain `*`), which
  # an unquoted expansion would subject to pathname expansion and
  # word splitting.
  local url=$2
  url=${url//\"/}
  printf -v "$target" '%s' "$url"
}
#######################################
# Reset every per-meeting document URL global to the empty string so that
# links from a previous meeting cannot leak into the next one.
# Globals (written): all fifteen *_URL variables listed below
# Arguments: none
#######################################
clear_agenda_url() {
  local slot
  for slot in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$slot" '%s' ""
  done
}
#######################################
# Private helper for download_agendas: fetch one document if its URL is set.
# Globals (read): WGET_UA
# Arguments: $1 - source URL (nothing happens when empty)
#            $2 - destination file path
#            $3 - progress message printed before the download
# Returns:   0 when the URL is empty, otherwise wget's exit status
#######################################
_download_agendas_fetch() {
  local url=$1 dest=$2 message=$3
  [[ -n "$url" ]] || return 0
  echo "$message"
  wget --no-check-certificate --user-agent="$WGET_UA" "$url" -O "$dest" -N -q #--show-progress
}

#######################################
# Save the agenda, minutes, cover page and addendum documents of the current
# meeting into a directory, using the *_URL globals as the list of what is
# available.
#
# Agenda: only the first available family is saved, in this order:
#   PDF (revised/regular) -> HTML (revised/regular) -> full package -> post.
# Minutes: PDFs (with attachments / plain) when either exists, else HTML.
# Cover page and addendum: every available format is saved.
#
# Globals (read): WGET_UA and all *_URL variables referenced below
# Arguments: $1 - existing directory to save the files into
# Returns:   status of the last download attempted for the addendum HTML,
#            or 0 when there is none
#######################################
download_agendas() {
  local dir=$1

  if [[ -n "$AGENDA_REVISE_PDF_URL" || -n "$AGENDA_PDF_URL" ]]; then
    _download_agendas_fetch "$AGENDA_REVISE_PDF_URL" "$dir/Agenda_Revised.pdf" "Saving revised agenda as PDF..."
    _download_agendas_fetch "$AGENDA_PDF_URL" "$dir/Agenda.pdf" "Saving regular agenda as PDF..."
  elif [[ -n "$AGENDA_REVISE_HTML_URL" || -n "$AGENDA_HTML_URL" ]]; then
    _download_agendas_fetch "$AGENDA_REVISE_HTML_URL" "$dir/Agenda_Revised.html" "Saving revised agenda as HTML... (no PDF found!)"
    _download_agendas_fetch "$AGENDA_HTML_URL" "$dir/Agenda.html" "Saving regular agenda as HTML... (no PDF found!)"
  elif [[ -n "$AGENDA_FULL_PDF_URL" || -n "$AGENDA_FULL_HTML_URL" ]]; then
    _download_agendas_fetch "$AGENDA_FULL_PDF_URL" "$dir/Agenda_FullPackage.pdf" "Saving full package agenda as PDF... (no HTML found!)"
    _download_agendas_fetch "$AGENDA_FULL_HTML_URL" "$dir/Agenda_FullPackage.html" "Saving full package agenda as HTML... (no PDF found!)"
  elif [[ -n "$AGENDA_POST_PDF_URL" || -n "$AGENDA_POST_HTML_URL" ]]; then
    # This message used to say "as HTML" although the file saved is the PDF.
    _download_agendas_fetch "$AGENDA_POST_PDF_URL" "$dir/Agenda_Post.pdf" "Saving post agenda as PDF... (no HTML found!)"
    _download_agendas_fetch "$AGENDA_POST_HTML_URL" "$dir/Agenda_Post.html" "Saving post agenda as HTML... (no PDF found!)"
  fi

  if [[ -n "$MINUTES_ATTACH_PDF_URL" || -n "$MINUTES_PDF_URL" ]]; then
    _download_agendas_fetch "$MINUTES_ATTACH_PDF_URL" "$dir/Minutes_With_Attachments.pdf" "Saving minutes with attachments as PDF..."
    _download_agendas_fetch "$MINUTES_PDF_URL" "$dir/Minutes.pdf" "Saving minutes as PDF..."
  else
    _download_agendas_fetch "$MINUTES_HTML_URL" "$dir/Minutes.html" "Saving minutes as HTML... (no PDF found!)"
  fi

  _download_agendas_fetch "$AGENDA_COVER_PDF_URL" "$dir/Agenda_Cover.pdf" "Saving cover agenda as PDF... (no HTML found!)"
  _download_agendas_fetch "$AGENDA_COVER_HTML_URL" "$dir/Agenda_Cover.html" "Saving cover agenda as HTML... (no PDF found!)"
  _download_agendas_fetch "$ADDENDUM_PDF_URL" "$dir/Addendum.pdf" "Saving addendum as PDF... (no HTML found!)"
  _download_agendas_fetch "$ADDENDUM_HTML_URL" "$dir/Addendum.html" "Saving addendum as HTML... (no PDF found!)"
}
source ./functions/.functions
# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
@ -196,12 +24,12 @@ ADDENDUM_HTML="./tmp/addendum.html"
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)00
current_day=$(date +%d)
SUPPORT_PAST="FALSE"
SUPPORT_PAST=""
if [ -d "$TEMP_DIR" ]; then
rm -r $TEMP_DIR
rm -r $TEMP_DIR
fi
rm -f $INDEX_PAGE
rm -f $SEARCH_PAGE
@ -209,215 +37,211 @@ rm -f $AGENDA_HTML
mkdir $TEMP_DIR
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
INDEX_END="FALSE"
while [[ $INDEX_END == "FALSE" ]]; do
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
if [ $? -ne 8 ]; then
while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
INDEX_END="FALSE"
while [[ $INDEX_END == "FALSE" ]]; do
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --no-hsts --show-progress
if [ $? -ne 8 ]; then
FOUNDLIST="FALSE"
while IFS= read -r LINE; do
if [[ "TRUE" == $FOUNDLIST ]]; then
GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE"
break
else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
echo "-========================================================================-"
echo "- $MEETING_NAME"
if [[ "TRUE" == $FOUNDLIST ]]; then
GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE"
break
else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
echo "-========================================================================-"
echo "- $MEETING_NAME"
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
echo "- Corrected to: $MEETING_NAME"
fi
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
echo "- Corrected to: $MEETING_NAME"
fi
# Pages start at 1. Ew.
x=1
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
#cat "${TEMP_DIR}escribe.json" > debug.json
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
#cat "${TEMP_DIR}escribe.json" > debug.json
y=0
i=0
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
while (true); do
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
y=0
i=0
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
while (true); do
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length')
if [[ "$NUM_IN_JSON" == "" ]]; then
if [[ "$NUM_IN_JSON" == "" ]]; then
break
fi
fi
# Decrease in the meeting count == we're on the final page.
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then
# Decrease in the meeting count == we're on the final page.
if (($i >= $NUM_IN_JSON)) && ((10#$NUM_IN_JSON >= 50)); then
((x++))
i=0
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
elif (( $i >= 10#$NUM_IN_JSON )); then
break
fi
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
elif (($i >= 10#$NUM_IN_JSON)); then
break
fi
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
echo "$(($i + 1)) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
# No need to cat the entire file every time.
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
# No need to cat the entire file every time.
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' >"${TEMP_DIR}escribe_short.json"
#echo "> Meeting ID"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
#echo "> Meeting Attachments"
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
#echo "> Meeting ID"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
#echo "> Meeting Attachments"
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
clear_agenda_url
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
done
clear_agenda_url
for ((j = 0; j <= (($NUM_ATTACHMENTS - 1)); j++)); do
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
done
# "25 Feb 2026"
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
echo "Alternate date format."
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
# "Feb 25 2026"
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
echo "Standard date format."
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
else
echo "COULD NOT FIGURE OUT DATE FORMAT!"
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
fi
_time_parse_helper "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
INPAST=""
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
echo "NAME : $MEETING_NAME"
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
echo "A (H) : $AGENDA_HTML_URL"
echo "A (P) : $AGENDA_PDF_URL"
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
echo "AF(H) : $AGENDA_FULL_HTML_URL"
echo "AF(P) : $AGENDA_FULL_PDF_URL"
echo "AC(H) : $AGENDA_COVER_HTML_URL"
echo "AC(P) : $AGENDA_COVER_PDF_URL"
echo "AP(H) : $AGENDA_POST_HTML_URL"
echo "AP(P) : $AGENDA_POST_PDF_URL"
echo "M (H) : $MINUTES_HTML_URL"
echo "M (P) : $MINUTES_PDF_URL"
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
echo "AD(H) : $ADDENDUM_HTML_URL"
echo "AD(P) : $ADDENDUM_PDF_URL"
else
echo "Dates are in the past!"
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
INPAST="TRUE"
fi
INPAST=""
if ((10#$ITEM_YEAR >= 10#$current_year)) && ((10#$ITEM_MONTH >= $((10#$current_month - 1)))); then
echo "NAME : $MEETING_NAME"
echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
echo "A (H) : $AGENDA_HTML_URL"
echo "A (P) : $AGENDA_PDF_URL"
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
echo "AF(H) : $AGENDA_FULL_HTML_URL"
echo "AF(P) : $AGENDA_FULL_PDF_URL"
echo "AC(H) : $AGENDA_COVER_HTML_URL"
echo "AC(P) : $AGENDA_COVER_PDF_URL"
echo "AP(H) : $AGENDA_POST_HTML_URL"
echo "AP(P) : $AGENDA_POST_PDF_URL"
echo "M (H) : $MINUTES_HTML_URL"
echo "M (P) : $MINUTES_PDF_URL"
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
echo "AD(H) : $ADDENDUM_HTML_URL"
echo "AD(P) : $ADDENDUM_PDF_URL"
else
echo "Dates are in the past!"
echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
INPAST="TRUE"
fi
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
echo "Abort."
break
fi
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
echo "Abort."
break
fi
#echo "> Meeting Video"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
#echo "> Meeting Video"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
ERROR="FALSE"
ADDENDUM_ERROR="FALSE"
echo "Downloading agenda HTML..."
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_FULL_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_POST_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
else
ERROR="TRUE"
fi
ERROR="FALSE"
ADDENDUM_ERROR="FALSE"
echo "Downloading agenda HTML..."
if [[ $ADDENDUM_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
else
ADDENDUM_ERROR="TRUE"
fi
if [[ -n $AGENDA_REVISE_HTML_URL ]]; then
_utils_download_helper "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
elif [[ -n $AGENDA_HTML_URL ]]; then
_utils_download_helper "$AGENDA_HTML_URL" "$AGENDA_HTML"
if [[ "$ERROR" == "FALSE" ]]; then
elif [[ -n $AGENDA_FULL_HTML_URL ]]; then
_utils_download_helper "$AGENDA_FULL_HTML_URL" "$AGENDA_HTML"
mkdir "./$CITY_ARCHIVE_NAME"
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
elif [[ -n $AGENDA_POST_HTML_URL ]]; then
_utils_download_helper "$AGENDA_POST_HTML_URL" "$AGENDA_HTML"
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
fi
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/"
fi
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
if [ ! -d "$MEETING_DIR" ]; then
mkdir "$MEETING_DIR/"
fi
if [ ! -d "$MEETING_DIR/Attachments" ]; then
mkdir "$MEETING_DIR/Attachments/"
fi
elif [[ -n $AGENDA_COVER_HTML_URL ]]; then
_utils_download_helper "$AGENDA_COVER_HTML_URL" "$AGENDA_HTML"
else
ERROR="TRUE"
fi
if [[ $VIDEO_URL != "" ]]; then
echo "Saving recording URL..."
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
fi
if [[ -n $ADDENDUM_HTML_URL ]]; then
_utils_download_helper "$ADDENDUM_HTML_URL" "$ADDENDUM_HTML"
else
ADDENDUM_ERROR="TRUE"
fi
# Get attachment links
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
# Get attachment names
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
# Get attachment links
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
# Get attachment names
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
fi
# Download attachment and use the name grabbed above
echo "Found the following agenda attachments:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
echo "- $LINEA2"
wget --no-check-certificate --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -N -q #--show-progress
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
echo "All attachments saved."
if [[ "$ERROR" == "FALSE" ]]; then
download_agendas "$MEETING_DIR"
mkdir "./$CITY_ARCHIVE_NAME"
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
echo "dir not empty" >> /dev/null
else
rm -r "$MEETING_DIR/Attachments"
fi
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
fi
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR" ]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR/"
fi
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$ITEM_YEAR" "$ITEM_MONTH" "$ITEM_DAY")
if [ ! -d "$MEETING_DIR" ]; then
mkdir "$MEETING_DIR/"
fi
if [ ! -d "$MEETING_DIR/Attachments" ]; then
mkdir "$MEETING_DIR/Attachments/"
fi
echo "All files from this meeting have been saved."
fi
if [[ $VIDEO_URL != "" ]]; then
echo "Saving recording URL..."
echo "https://video.isilive.ca/london/"$VIDEO_URL >"$MEETING_DIR/RecordingLink.txt"
fi
((i++))
((y++))
# Get attachment links
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
# Get attachment names
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
# Get attachment links
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
# Get attachment names
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
fi
# Download attachment and use the name grabbed above
echo "Found the following agenda attachments:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
echo "- $LINEA2 / $LINEA1"
_utils_download_helper "$INDEX_URL$LINEA1" "$MEETING_DIR/Attachments/$LINEA2"
# [ ! -s "$MEETING_DIR/Attachments/$LINEA2" ] && rm -f "$MEETING_DIR/Attachments/$LINEA2"
done < ./tmp/attachment_urls 3<./tmp/attachment_names
echo "All attachments saved."
download_agendas "$MEETING_DIR"
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
echo "dir not empty" >>/dev/null
else
rm -r "$MEETING_DIR/Attachments"
fi
echo "All files from this meeting have been saved."
find "$MEETING_DIR" -type f -size 0 -delete
echo "Cleaning PDFs for archive.org..."
find "$MEETING_DIR" -type f -name '*.pdf' -print0 | xargs -0 -n1 qpdf --replace-input
# qpdf repairs and leaves garbage original PDFs
find "$MEETING_DIR" -type f -name '*~qpdf-orig' -delete -print
fi
((i++))
((y++))
done
fi
fi
fi
fi
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE"
fi
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE"
fi
done < $INDEX_PAGE
else
else
INDEX_END="TRUE"
echo "SCRAPE_ESCRIBE: Couldn't save index!"
fi
done
fi
done
done < websites.csv

47
SCRAPE_MPaS.SH Executable file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env bash
# SCRAPE_MPaS.SH
# Downloads every PDF linked from the City of London "Plans and Strategies"
# page into "./LondonArchive/Master Plans and Strategies/<section title>/".
# The section title is taken from the most recent <summary> element (or the
# <span> directly following it) seen before the PDF link.
# Requires: wget, and ./functions/.functions for _utils_download_helper.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
source ./functions/.functions
# London seems to have recently blocked unusual user agents (plain wget, even
# ping, gets refused), so we present a regular browser user agent instead.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_mpas.html"
ARCHIVE_DIR="./LondonArchive/Master Plans and Strategies"
SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"
rm -f -- "$SEARCH_PAGE"
# -p: the temp directory normally already exists on every run after the first.
mkdir -p "$TEMP_DIR"
# --timestamping was dropped: wget ignores it (with a warning) when -O is used.
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" -q #--show-progress
# Exit status 8 means the server answered with an error response.
if [ $? -ne 8 ]; then
  CURRENT=""
  LAST_LINE_SUMMARY=0
  # Put every tag on its own line so each one can be inspected separately.
  sed 's/></>\n</g' "$SEARCH_PAGE" |
    while IFS= read -r LINE; do
      # A <span> directly after a <summary> line carries the section title.
      if (( LAST_LINE_SUMMARY )) && [[ "$LINE" == "<span>"* ]]; then
        CURRENT=$(printf '%s\n' "$LINE" | sed 's/.*<span>\([^<]*\)<\/span>.*/\1/')
        echo "$CURRENT"
      fi
      LAST_LINE_SUMMARY=0
      if [[ "$LINE" == *"<summary>"* ]]; then
        LAST_LINE_SUMMARY=1
        CURRENT=$(printf '%s\n' "$LINE" | sed 's/.*<summary>\([^<]*\)<\/summary>.*/\1/')
        echo "$CURRENT"
      fi
      if [[ "$LINE" == *".pdf"* ]]; then
        # Handle every PDF link on the line; a line that mentions ".pdf"
        # without an href produces no iterations and is skipped.
        while IFS= read -r HREF; do
          [[ -n "$HREF" ]] || continue
          # FILE is only the local name (spaces decoded, apostrophes dropped);
          # the request itself uses the href exactly as the page published it.
          FILE="$(printf '%s\n' "$HREF" | sed 's/%20/ /g; s/%27//g')"
          echo "$FILE"
          mkdir -p "$ARCHIVE_DIR/$CURRENT/"
          _utils_download_helper "https://london.ca$HREF" "$ARCHIVE_DIR/$CURRENT/$(basename "$FILE")"
        done < <(printf '%s\n' "$LINE" | grep -o 'href="[^"]*\.pdf"' | sed 's/^href="//; s/"$//; s#^https://london\.ca##')
      fi
    done
fi

44
SCRAPE_OPEN.SH Normal file → Executable file
View File

@ -8,15 +8,19 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-"
echo -e "-========================================================================-"
source ./functions/.functions
WORKDIR="./tmp"
STAGEDIR="./staging"
DOCDIR="./LondonArchive_OpenData"
MAPDIR="./LondonArchive_OpenData/Maps"
DOCDIR="./LondonArchive/OpenData"
MAPDIR="./LondonArchive/OpenData/Maps"
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
mkdir $WORKDIR
mkdir $DOCDIR
mkdir $MAPDIR
mkdir -p $WORKDIR
mkdir -p $DOCDIR
mkdir -p $MAPDIR
DOWNLOAD_MAPS=0
i=0
SEARCH_END=0
@ -50,18 +54,25 @@ while [[ $SEARCH_END == 0 ]]; do
echo " Cur. article: $i.$j, URL : $ITEM_URL"
echo " Cur. article: $i.$j, Name : $ITEM_NAME"
rm -rf $STAGEDIR
mkdir $STAGEDIR
#rm -rf $STAGEDIR
#mkdir $STAGEDIR
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$STAGEDIR/$ITEM_NAME" -c -q
_utils_download_helper "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" "$DOCDIR/$ITEM_NAME"
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$DOCDIR/$ITEM_NAME" -c -q
echo " Downloaded."
echo "Compressing."
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
echo "(Not) Compressing."
# No need to compress non-map data.
#7z a "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
fi
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]]; then
# This section is deprecated. Use SCRAPE_AGIS.SH instead.
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]] && (( DOWNLOAD_MAPS )); then
MAPDIR_ITEM=$(echo "$MAPDIR/$ITEM_TITLE")
mkdir -p "$MAPDIR_ITEM"
echo "Item: $MAPDIR_ITEM"
MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')"
echo " ^^^ Item is map. ($MAP_ID) "
# https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
@ -71,21 +82,22 @@ while [[ $SEARCH_END == 0 ]]; do
MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"
echo " Map URL (CSV) : $MAP_CSV"
wget --user-agent="$WGET_UA" "$MAP_CSV" -O "$STAGEDIR/$ITEM_TITLE.csv" -c -q
_utils_download_helper "$MAP_CSV" "$MAPDIR_ITEM/$ITEM_TITLE.csv"
echo " Downloaded."
echo " Map URL (Shapefile): $MAP_SHP"
wget --user-agent="$WGET_UA" "$MAP_SHP" -O "$STAGEDIR/$ITEM_TITLE.shp" -c -q
_utils_download_helper "$MAP_SHP" "$MAPDIR_ITEM/$ITEM_TITLE.shp"
echo " Downloaded."
echo " Map URL (GeoJSON) : $MAP_GEO"
wget --user-agent="$WGET_UA" "$MAP_GEO" -O "$STAGEDIR/$ITEM_TITLE.geojson" -c -q
_utils_download_helper "$MAP_GEO" "$MAPDIR_ITEM/$ITEM_TITLE.geojson"
echo " Downloaded."
echo " Map URL (KML) : $MAP_KML"
wget --user-agent="$WGET_UA" "$MAP_KML" -O "$STAGEDIR/$ITEM_TITLE.kml" -c -q
_utils_download_helper "$MAP_KML" "$MAPDIR_ITEM/$ITEM_TITLE.kml"
echo " Downloaded."
echo ' Source URL is $ITEM_URL.'
echo "Compressing."
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$MAPDIR/$ITEM_TITLE.7z" "$STAGEDIR"
rm -f "$MAPDIR_ITEM/$ITEM_TITLE.7z"
7z a "$MAPDIR_ITEM/$ITEM_TITLE.7z" "$MAPDIR_ITEM"
fi
done

78
SCRAPE_PLAN.SH Normal file → Executable file
View File

@ -7,49 +7,7 @@ echo -e "-=- Lillian Skinner
echo -e "-=- -=-"
echo -e "-========================================================================-"
# Pulls a date out of a line that contains "..., <Month> <day>, ... <year>"
# (for example the text of a <time> element).
# Arguments:
#   $1 - the line to inspect
# Globals written:
#   PROJECT_TIME_YEAR, PROJECT_TIME_MONTH_WORD, PROJECT_TIME_DAY_SHORT,
#   PROJECT_TIME_DAY (two digits, "00" when no day was found),
#   PROJECT_TIME_MONTH (two digits, "--" when the month is not recognised)
conv_date_plan() {
  # Collapse whitespace the way an unquoted expansion would, but without
  # letting glob characters in the line expand to file names.
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words <<<"$1"
  local line="${words[*]}"

  PROJECT_TIME_YEAR=$(printf '%s\n' "$line" | sed 's/.*\([0-9]\{4\}\).*/\1/p' | uniq)
  PROJECT_TIME_MONTH_WORD=$(printf '%s\n' "$line" | sed 's/.*,\s*\([A-Za-z]*\)\s[0-9]\{1,2\},.*/\1/p' | uniq)
  PROJECT_TIME_DAY_SHORT=$(printf '%s\n' "$line" | sed 's/.*,\s*[A-Za-z]*\s\([0-9]\{1,2\}\),.*/\1/p' | uniq)

  # Force base 10: a bare "08" or "09" is rejected by printf as invalid octal
  # and would be formatted as "00".
  if [[ $PROJECT_TIME_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v PROJECT_TIME_DAY '%02d' "$((10#$PROJECT_TIME_DAY_SHORT))"
  else
    PROJECT_TIME_DAY="00"
  fi

  case "$PROJECT_TIME_MONTH_WORD" in
    Jan*) PROJECT_TIME_MONTH="01" ;;
    Feb*) PROJECT_TIME_MONTH="02" ;;
    Mar*) PROJECT_TIME_MONTH="03" ;;
    Apr*) PROJECT_TIME_MONTH="04" ;;
    May) PROJECT_TIME_MONTH="05" ;;
    Jun*) PROJECT_TIME_MONTH="06" ;;
    Jul*) PROJECT_TIME_MONTH="07" ;;
    Aug*) PROJECT_TIME_MONTH="08" ;;
    Sep*) PROJECT_TIME_MONTH="09" ;;
    Oct*) PROJECT_TIME_MONTH="10" ;;
    Nov*) PROJECT_TIME_MONTH="11" ;;
    Dec*) PROJECT_TIME_MONTH="12" ;;
    *) PROJECT_TIME_MONTH="--" ;;
  esac
}
# Parses a date written as "<Month> <day>, <year>" (anything after the year is
# ignored).
# Arguments:
#   $1 - the date string
# Globals written:
#   MODIFIED_MONTH_WORD, MODIFIED_DAY_SHORT, MODIFIED_YEAR,
#   MODIFIED_DAY (two digits, "00" when no numeric day was extracted),
#   MODIFIED_MONTH (two digits, "--" when the month is not recognised)
conv_date() {
  local trim='s/^[ \t]*//;s/[ \t]*$//'

  MODIFIED_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed "$trim")
  MODIFIED_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed "$trim")
  MODIFIED_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed "$trim")

  # Force base 10: a bare "08" or "09" is rejected by printf as invalid octal
  # and would be formatted as "00".
  if [[ $MODIFIED_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v MODIFIED_DAY '%02d' "$((10#$MODIFIED_DAY_SHORT))"
  else
    MODIFIED_DAY="00"
  fi

  case "$MODIFIED_MONTH_WORD" in
    Jan*) MODIFIED_MONTH="01" ;;
    Feb*) MODIFIED_MONTH="02" ;;
    Mar*) MODIFIED_MONTH="03" ;;
    Apr*) MODIFIED_MONTH="04" ;;
    May) MODIFIED_MONTH="05" ;;
    Jun*) MODIFIED_MONTH="06" ;;
    Jul*) MODIFIED_MONTH="07" ;;
    Aug*) MODIFIED_MONTH="08" ;;
    Sep*) MODIFIED_MONTH="09" ;;
    Oct*) MODIFIED_MONTH="10" ;;
    Nov*) MODIFIED_MONTH="11" ;;
    Dec*) MODIFIED_MONTH="12" ;;
    *) MODIFIED_MONTH="--" ;;
  esac
}
source ./functions/.functions
# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
@ -81,8 +39,8 @@ mkdir $TEMP_DIR
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
j=0
SEARCH_END="FALSE"
while [[ $SEARCH_END == "FALSE" ]]; do
SEARCH_END=0
while (( ! SEARCH_END )); do
echo "-========================================================================-"
echo "Downloading search results... Page $j"
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
@ -107,11 +65,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
echo " Found project: $PROJECT_NAME"
MODIFIED_MONTH=""
MODIFIED_YEAR=""
conv_date "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
if (( 10#$MODIFIED_YEAR >= 10#$current_year )) && (( 10#$MODIFIED_MONTH >= $((10#$current_month - 1)) )); then
echo "Last Modified: $MODIFIED_YEAR/$MODIFIED_MONTH/$MODIFIED_DAY"
ITEM_MONTH=""
ITEM_YEAR=""
_time_parse_helper "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
if (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
echo "Last Modified: $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
else
echo "Dates are in the past! Abort."
break
@ -124,8 +82,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
rm -f $PROJECT_IMAGE_NAMES
while IFS= read -r PLINE; do
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
NEXT_LINE_FITEM="FALSE"
if (( NEXT_LINE_FITEM )); then
NEXT_LINE_FITEM=0
# Is this line an actual item?
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
@ -208,15 +166,15 @@ while [[ $SEARCH_END == "FALSE" ]]; do
fi
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
if [[ $PROJECT_FOUND_TIME != "" ]]; then
conv_date_plan "$PLINE"
echo "Found date : $PROJECT_TIME_YEAR/$PROJECT_TIME_MONTH/$PROJECT_TIME_DAY"
_time_parse_helper "$(echo $PLINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g' | cut -d, -f2- | cut -d\ -f2-)"
echo "Found date : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
fi
fi
fi
fi
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
NEXT_LINE_IMAGE="FALSE"
if (( NEXT_LINE_IMAGE )); then
NEXT_LINE_IMAGE=0
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
@ -243,14 +201,14 @@ while [[ $SEARCH_END == "FALSE" ]]; do
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
NEXT_LINE_FITEM="TRUE"
NEXT_LINE_FITEM=1
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
# We're setting a flag to let the script know if an upcoming line is contents.
fi
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
NEXT_LINE_IMAGE="TRUE"
NEXT_LINE_IMAGE=1
# Same idea as before but for the image shown on the main page.
fi
@ -340,11 +298,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
fi
done < $SEARCH_PAGE
else
SEARCH_END="TRUE"
SEARCH_END=1
echo "No more pages!"
fi
else
SEARCH_END="TRUE"
SEARCH_END=1
echo "No more pages!"
fi
((j++))

9
functions/.functions Normal file
View File

@ -0,0 +1,9 @@
# Loader for the shared helper library. Resolves the directory this file
# lives in (so it works regardless of the caller's working directory) and
# pulls in each helper file from there.
sdir="$(dirname "${BASH_SOURCE[0]}")"
sdir="$(cd "$sdir" && pwd)"

# Helpers shared by every scraper: date parsing and download/PDF utilities.
. "$sdir/.functions.time"
. "$sdir/.functions.utils"

# Helpers used by individual scrapers: FilePro folders and eScribe meetings.
. "$sdir/.functions.filepro"
. "$sdir/.functions.escribe"
source "$sdir/.functions.escribe"

View File

@ -0,0 +1,133 @@
# Stores the URL of one meeting document in the global that matches its label.
# Arguments:
#   $1 - document label including its surrounding double quotes,
#        e.g. '"Agenda (PDF)"'
#   $2 - URL of that document; double quotes are removed and whitespace is
#        trimmed/collapsed before it is stored
# Globals written: the one *_URL variable selected by the label.
# Labels that are not listed below are ignored.
set_agenda_url() {
  local target
  case "$1" in
    '"Agenda (HTML)"') target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"') target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"') target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"') target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"') target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"') target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"') target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"') target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"') target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"') target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"') target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"') target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"') target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"') target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  # Split on whitespace with read rather than an unquoted expansion: the
  # result is trimmed the same way, but "?", "*" and "[" in the URL can no
  # longer be glob-expanded against the current directory.
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words <<<"$2"
  local url="${words[*]}"

  printf -v "$target" '%s' "${url//\"/}"
}
# Resets every per-meeting document URL global to the empty string so values
# from the previous meeting cannot leak into the next one.
clear_agenda_url() {
  local url_var
  for url_var in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$url_var" '%s' ''
  done
}
# Saves every document of the current meeting whose URL global is set.
# Arguments:
#   $1 - directory the files are written into
# Globals read: the *_URL variables filled in by set_agenda_url.
# Each row of the plan below is
#   "<globals that must be empty>|<URL global>|<progress message>|<file name>"
# and rows are processed top to bottom. An HTML copy is only fetched when the
# PDF global(s) named in the first field are empty.
download_agendas() {
  local outdir="$1"
  local -a plan=(
    "|AGENDA_REVISE_PDF_URL|Saving revised agenda as PDF...|Agenda_Revised.pdf"
    "|AGENDA_PDF_URL|Saving regular agenda as PDF...|Agenda.pdf"
    "AGENDA_REVISE_PDF_URL|AGENDA_REVISE_HTML_URL|Saving revised agenda as HTML... (no PDF found!)|Agenda_Revised.html"
    "AGENDA_PDF_URL|AGENDA_HTML_URL|Saving regular agenda as HTML... (no PDF found!)|Agenda.html"
    "|AGENDA_FULL_PDF_URL|Saving full package agenda as PDF... (no HTML found!)|Agenda_FullPackage.pdf"
    "AGENDA_FULL_PDF_URL|AGENDA_FULL_HTML_URL|Saving full package agenda as HTML... (no PDF found!)|Agenda_FullPackage.html"
    "|AGENDA_POST_PDF_URL|Saving post agenda as PDF...|Agenda_Post.pdf"
    "AGENDA_POST_PDF_URL|AGENDA_POST_HTML_URL|Saving post agenda as HTML... (no PDF found!)|Agenda_Post.html"
    "|MINUTES_ATTACH_PDF_URL|Saving minutes with attachments as PDF...|Minutes_With_Attachments.pdf"
    "|MINUTES_PDF_URL|Saving minutes as PDF...|Minutes.pdf"
    "MINUTES_ATTACH_PDF_URL MINUTES_PDF_URL|MINUTES_HTML_URL|Saving minutes as HTML... (no PDF found!)|Minutes.html"
    "|AGENDA_COVER_PDF_URL|Saving cover agenda as PDF... (no HTML found!)|Agenda_Cover.pdf"
    "AGENDA_COVER_PDF_URL|AGENDA_COVER_HTML_URL|Saving cover agenda as HTML... (no PDF found!)|Agenda_Cover.html"
    "|ADDENDUM_PDF_URL|Saving addendum as PDF... (no HTML found!)|Addendum.pdf"
    "ADDENDUM_PDF_URL|ADDENDUM_HTML_URL|Saving addendum as HTML... (no PDF found!)|Addendum.html"
  )
  local entry guard_list url_var message filename guard superseded
  local -a guards
  for entry in "${plan[@]}"; do
    IFS='|' read -r guard_list url_var message filename <<<"$entry"
    guards=()
    IFS=' ' read -r -a guards <<<"$guard_list"
    superseded=0
    for guard in "${guards[@]}"; do
      if [[ -n ${!guard} ]]; then
        superseded=1
      fi
    done
    # Keep this "if" as the final command of the loop body so the function's
    # exit status is that of the last download attempted in the last row.
    if (( ! superseded )) && [[ -n ${!url_var} ]]; then
      echo "$message"
      _utils_download_helper "${!url_var}" "$outdir/$filename"
    fi
  done
}

View File

@ -0,0 +1,34 @@
# Recursively mirrors one FilePro folder listing.
# Arguments:
#   $1 - local directory that receives the documents of this folder
#   $2 - URL of the folder's index page
#   $3 - folder path, used for the progress message and extended with each
#        sub-folder id on recursion
# Globals read: WGET_UA (user agent for wget), START_URL (site base URL)
# Every index line is inspected for data-id, data-title and data-type
# attributes: "document" entries are downloaded as "<title>.pdf", "folder"
# entries are descended into. Lines without those attributes match neither.
# Returns 1 (without terminating the calling script) when called with no
# arguments or when the temporary index file cannot be created.
_filepro_download_folder() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <output dir> <folder index url> [folder path]"
    return 1
  fi
  local tmp_dir="$1"
  local tmp_index
  local LINE
  local LINE_ID
  local LINE_TITLE
  local LINE_TYPE
  tmp_index=$(mktemp) || return 1

  wget --no-check-certificate --user-agent="$WGET_UA" "$2" -O "$tmp_index" --no-hsts -q
  echo "Looking in folder $3/"
  echo "Download to $tmp_dir/"

  while IFS= read -r LINE; do
    # printf with a quoted argument: titles containing "*", "?" or "[" must
    # not be glob-expanded against the current directory.
    LINE_ID=$(printf '%s\n' "$LINE" | sed 's/.*data-id="\([^"]*\)".*/\1/g')
    LINE_TITLE=$(printf '%s\n' "$LINE" | sed 's/.*data-title="\([^"]*\)".*/\1/g' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | sed 's/&#39;/'\''/g')
    LINE_TYPE=$(printf '%s\n' "$LINE" | sed 's/.*data-type="\([^"]*\)".*/\1/g')
    if [[ "$LINE_TYPE" == "document" ]]; then
      echo "Found document: $LINE_ID : $LINE_TITLE.pdf... downloading..."
      mkdir -p "$tmp_dir"
      _utils_download_helper "${START_URL}/document/$LINE_ID" "$tmp_dir/$LINE_TITLE.pdf"
    elif [[ "$LINE_TYPE" == "folder" ]]; then
      _filepro_download_folder "$tmp_dir/$LINE_TITLE" "${START_URL}/filepro/documents/$LINE_ID" "$3/$LINE_ID"
    fi
  done < "$tmp_index"

  rm -f -- "$tmp_index"
}

71
functions/.functions.time Normal file
View File

@ -0,0 +1,71 @@
# Detects which of the two supported layouts a date string uses and hands it
# to the matching parser.
# Arguments:
#   $1 - date string, either "<day> <month> <year>" or "<month> <day>, <year>"
# Output: echoes the whitespace-collapsed input, then either the parser's
#   output or an error line.
# For detection only, double quotes and the first comma are removed; the
# parser receives the whitespace-collapsed string with quotes and commas
# still in place.
# Returns: the parser's status, or 1 when called without arguments or when
#   neither layout matches.
_time_parse_helper() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date>"
    return 1
  fi

  # Split on whitespace with read rather than an unquoted expansion: same
  # trimming/collapsing, but no glob expansion of the input.
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words <<<"$1"
  local collapsed="${words[*]}"
  printf '%s\n' "$collapsed"

  local probe="${collapsed//\"/}"
  probe="${probe/,/}"
  local re_dmy='^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$'
  local re_mdy='^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$'

  if [[ $probe =~ $re_dmy ]]; then
    _time_parse_ddmonyyyy "$collapsed"
  elif [[ $probe =~ $re_mdy ]]; then
    _time_parse_monddyyyy "$collapsed"
  else
    echo "COULD NOT FIGURE OUT DATE FORMAT!"
    return 1
  fi
}
# Parses a date written as "<Month> <day>, <year>"; the comma is optional and
# anything after the year is ignored. Double quotes are stripped first.
# Arguments:
#   $1 - the date string
# Globals written:
#   ITEM_MONTH_WORD, ITEM_DAY_SHORT, ITEM_YEAR,
#   ITEM_DAY (two digits), ITEM_MONTH (two digits, "--" when not recognised)
# Returns: 0 on success. When the string does not have the expected layout
#   the word/short/year globals are emptied, ITEM_DAY is "00", ITEM_MONTH is
#   "--" and 1 is returned. Also returns 1 when called without arguments.
_time_parse_monddyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in mon dd yyyy>"
    return 1
  fi

  local cleaned="${1//\"/}"
  local layout='^[[:space:]]*([A-Za-z]+)[[:space:]]+([0-9]+),?[[:space:]]+([0-9]+)'
  if [[ ! $cleaned =~ $layout ]]; then
    ITEM_MONTH_WORD=""
    ITEM_DAY_SHORT=""
    ITEM_DAY="00"
    ITEM_YEAR=""
    ITEM_MONTH="--"
    return 1
  fi

  ITEM_MONTH_WORD="${BASH_REMATCH[1]}"
  ITEM_DAY_SHORT="${BASH_REMATCH[2]}"
  ITEM_YEAR="${BASH_REMATCH[3]}"
  # Force base 10: a bare "08" or "09" is rejected by printf as invalid octal
  # and would be formatted as "00".
  printf -v ITEM_DAY '%02d' "$((10#$ITEM_DAY_SHORT))"

  case "$ITEM_MONTH_WORD" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}
# Parses a date written as "<day> <month> <year>". Double quotes and commas
# are stripped first; the day is the first word, the month the second word
# and the year the last word.
# Arguments:
#   $1 - the date string
# Globals written:
#   ITEM_MONTH_WORD, ITEM_DAY_SHORT, ITEM_YEAR,
#   ITEM_DAY (two digits), ITEM_MONTH (two digits, "--" when not recognised)
# Returns: 0 on success. When there are fewer than three words or the day or
#   year is not numeric, the word/short/year globals are emptied, ITEM_DAY is
#   "00", ITEM_MONTH is "--" and 1 is returned. Also returns 1 when called
#   without arguments.
_time_parse_ddmonyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in dd mon yyyy>"
    return 1
  fi

  local cleaned="${1//\"/}"
  cleaned="${cleaned//,/}"
  local IFS=$' \t\n'
  local -a parts=()
  read -r -d '' -a parts <<<"$cleaned"
  local count=${#parts[@]}

  if (( count < 3 )) ||
    [[ ! ${parts[0]} =~ ^[0-9]+$ ]] ||
    [[ ! ${parts[count - 1]} =~ ^[0-9]+$ ]]; then
    ITEM_MONTH_WORD=""
    ITEM_DAY_SHORT=""
    ITEM_DAY="00"
    ITEM_YEAR=""
    ITEM_MONTH="--"
    return 1
  fi

  ITEM_DAY_SHORT="${parts[0]}"
  ITEM_MONTH_WORD="${parts[1]}"
  ITEM_YEAR="${parts[count - 1]}"
  # Force base 10: a bare "08" or "09" is rejected by printf as invalid octal
  # and would be formatted as "00".
  printf -v ITEM_DAY '%02d' "$((10#$ITEM_DAY_SHORT))"

  case "$ITEM_MONTH_WORD" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}

104
functions/.functions.utils Normal file
View File

@ -0,0 +1,104 @@
# Produces a text-searchable copy of a PDF.
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
# If pdffonts reports any font for the input (checked with "-l 5"), the file
# is treated as already OCRed and copied to the output unchanged. Otherwise
# the PDF is split into pages and each page is rendered to a PNG, rotated as
# tesseract suggests, OCRed with ocrmypdf, and the pages are joined again
# with pdfunite.
# Returns: 0 on success, non-zero when a required step fails. The function
#   never terminates the calling script, and its scratch directory is removed
#   on every path.
# Requires: pdffonts, pdfseparate, pdftoppm, pdfunite, qpdf, tesseract,
#   convert and ocrmypdf.
_utils_ocrmypdf() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <in.pdf> <out.pdf>"
    return 1
  fi
  local in="$1"
  local out="$2"
  local fonts tmp status

  # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
  fonts=$(pdffonts -l 5 "$in" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
  if [ "$fonts" != '' ] && [ "$fonts" != '[none]' ]; then
    echo "$in is already OCRed. Saving as is."
    cp "$in" "$out"
    # return, not exit: this function is sourced into long-running scrapers.
    return
  fi
  echo "NOT OCRed yet. Working..."

  tmp=$(mktemp -d) || return 1
  _utils_ocrmypdf_pages "$in" "$out" "$tmp"
  status=$?
  rm -rf -- "$tmp"
  return "$status"
}

# Private helper of _utils_ocrmypdf: OCRs $1 page by page inside the scratch
# directory $3 and writes the joined result to $2.
_utils_ocrmypdf_pages() {
  local in="$1"
  local out="$2"
  local tmp="$3"
  local page img rotation
  local idx=0
  local -a ocr_pages=()

  pdfseparate "$in" "$tmp/page-%04d.pdf" || return 1
  for page in "$tmp"/page-*.pdf; do
    img="$tmp/img-$idx.png"
    qpdf --replace-input --rotate=0:1-z "$page"
    pdftoppm -singlefile -r 300 -png -cropbox "$page" "$tmp/img-$idx" || return 1

    # Checks rotations. Annoying way to do it but whatever.
    rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
    case "$rotation" in
      90 | 180 | 270) convert "$img" -rotate "$rotation" "$img" ;;
    esac

    ocrmypdf \
      --skip-text \
      --clean \
      --optimize 1 \
      --jobs 1 \
      "$img" "$tmp/ocr-$idx-tmp.pdf" || return 1

    # NOTE(review): both 90 and 270 are turned by 270 here, and 180 is not
    # turned back at all; that is the existing behaviour and is kept as is.
    # Confirm whether the 270 case was meant to use 90.
    case "$rotation" in
      90 | 270)
        qpdf "$tmp/ocr-$idx-tmp.pdf" "$tmp/ocr-$idx.pdf" --rotate=270:1-z &&
          rm -f "$tmp/ocr-$idx-tmp.pdf"
        ;;
    esac
    # Only move when the unrotated file is still there (no rotation needed,
    # or the qpdf rotation failed).
    if [[ -e "$tmp/ocr-$idx-tmp.pdf" ]]; then
      mv "$tmp/ocr-$idx-tmp.pdf" "$tmp/ocr-$idx.pdf"
    fi

    # Remember the pages in processing order instead of re-listing them.
    ocr_pages+=("$tmp/ocr-$idx.pdf")
    idx=$((idx + 1))
  done

  pdfunite "${ocr_pages[@]}" "$out" || return 1
}
# Normalises a string (typically a file name taken from a URL) with a Perl
# one-liner and prints the result to stdout without a trailing newline.
# Arguments:
#   $1 - the string to clean up
# Steps performed by the Perl code, in order:
#   1. URL-unescape repeatedly until the string stops changing
#   2. apply Unicode NFKC normalisation
#   3. map the listed Unicode dash/minus code points to a plain "-"
#   4. replace every "&" with "and"
#   5. map U+2018/U+2019/U+201B to "'" and U+201C/U+201D to '"'
#   6. turn U+00A0 into a space and delete U+200B, U+200C, U+200D and U+FEFF
#   7. collapse whitespace runs to one space, trim both ends, and remove
#      whitespace directly in front of a final ".ext" suffix
# Requires perl with the URI::Escape and Unicode::Normalize modules.
# NOTE: when called without arguments this runs "exit 1", which ends the
# calling shell, not just this function.
# NOTE(review): uri_unescape yields raw octets, so percent-encoded multi-byte
# UTF-8 may not reach the tr/// tables as decoded characters - confirm.
_utils_fix_dashes() {
if [ "$#" -eq 0 ]; then
echo "Usage: <input string>"
exit 1
fi
# -CSDA makes Perl treat the standard streams and the command-line argument
# as UTF-8; the string is passed as the script's only argument.
perl -CSDA -MURI::Escape -MUnicode::Normalize -e '
binmode STDOUT, ":utf8";
my $s = shift // "";
my $prev;
do { $prev = $s; $s = uri_unescape($s); } while ($s ne $prev);
$s = NFKC($s);
$s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
$s =~ s/&/and/g;
$s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
$s =~ tr/\x{201C}\x{201D}/"/;
$s =~ tr/\x{00A0}/ /;
$s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
$s =~ s/\s+/ /g;
$s =~ s/^\s+|\s+$//g;
$s =~ s/\s+(\.[^. ]+)$/$1/;
print $s;
' "$1"
}
# Downloads a URL to a file with curl, skipping the transfer when the server
# reports that the existing local copy is still current.
# Arguments:
#   $1 - URL to fetch
#   $2 - destination file
# Globals: WGET_UA is used as the user agent. It is only given the built-in
#   browser default when it is unset or empty, so a value chosen by the
#   calling script is respected.
# Output: "Downloaded." or "Already exists! Skipping." on stdout; failures
#   are reported on stderr.
# Returns: 0 for HTTP 200 and 304, 1 otherwise (including missing arguments).
# The response is written to "<destination>.part" and only moved into place
# after an HTTP 200, so a failed re-download never damages or removes a copy
# that was archived earlier.
_utils_download_helper() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <url> <outfile>"
    return 1
  fi
  : "${WGET_UA:=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87}"
  local url="$1"
  local out="$2"
  local part="$out.part"
  local code
  local -a time_cond=()

  # Only send a time condition when there is a local copy to compare with;
  # otherwise curl would try to parse the path as a date.
  if [[ -e "$out" ]]; then
    time_cond=(-z "$out")
  fi

  code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 "${time_cond[@]}" -o "$part" "$url")
  case "$code" in
    200)
      if [[ -e "$part" ]] && ! mv -f -- "$part" "$out"; then
        echo "FAILED! could not move $part to $out" >&2
        rm -f -- "$part"
        return 1
      fi
      echo "Downloaded."
      ;;
    304)
      rm -f -- "$part"
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      rm -f -- "$part"
      return 1
      ;;
  esac
}

78
template/default.html Normal file
View File

@ -0,0 +1,78 @@
<meta charset="UTF-8">
<style>
body {
width: 90%;
min-width: 600px;
position: relative;
margin-left: auto;
margin-right: auto;
color: #666;
font-size: 16px;
font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 300;
}
strong {
font-weight: 700;
}
p {
color: #000000;
}
.h1, .h2, .h3, .h4, .h5, .h6, .post-teaser.featured .post-title, h1, h2, h3, h4, h5, h6 {
font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 700;
line-height: 1.1;
color: #087ac0;
}
table {
display: table;
margin-bottom: 2em;
min-width: 100%;
border-spacing: 0;
border-collapse: collapse;
border-color: #ccc;
background-color: transparent;
line-height: 1.5;
}
.table-responsive {
overflow-x: auto;
}
table tbody {
display: table-row-group;
vertical-align: top;
border-color: inherit;
}
table tbody > tr:nth-of-type(2n+1) {
background-color: #f2f2f2;
}
table tr {
display: table-row;
vertical-align: inherit;
border-color: inherit;
}
table tbody > tr > td, table tbody > tr > th, table thead > tr > th {
padding: 8px;
border: 1px solid #ccc;
vertical-align: top;
}
table td {
display: table-cell;
border: 1px solid #ccc;
}
ol, ul {
margin-top: 0;
margin-bottom: 12px;
}
ol, ul {
box-sizing: border-box;
}
ol li {
padding-left: 10px;
}
ol li, ul li {
padding-bottom: 12px;
}
address, dd, dt, li, p {
line-height: 1.5;
}
</style>

View File

@ -0,0 +1,83 @@
<meta charset="UTF-8">
<style>
td, h1, h2, h3, p, b, div, i, span, label, ul, li, tr, table { page-break-inside: avoid; }
body {
width: 90%;
min-width: 600px;
position: relative;
margin-left: auto;
margin-right: auto;
color: #666;
font-size: 16px;
font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 300;
}
strong {
font-weight: 700;
}
p {
color: #000000;
}
.h1, .h2, .h3, .h4, .h5, .h6, .post-teaser.featured .post-title, h1, h2, h3, h4, h5, h6 {
font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
font-weight: 700;
line-height: 1.1;
color: #087ac0;
}
table {
display: table;
margin-bottom: 2em;
min-width: 100%;
border-spacing: 0;
border-collapse: collapse;
border-color: #ccc;
background-color: transparent;
line-height: 1.5;
}
.table-responsive {
overflow-x: auto;
}
table tbody {
display: table-row-group;
vertical-align: top;
border-color: inherit;
}
table tbody > tr:nth-of-type(2n+1) {
background-color: #f2f2f2;
}
table tr {
display: table-row;
vertical-align: inherit;
border-color: inherit;
}
table tbody > tr > td, table tbody > tr > th, table thead > tr > th {
padding: 8px;
border: 1px solid #ccc;
vertical-align: top;
}
table td {
display: table-cell;
border: 1px solid #ccc;
}
ol, ul {
margin-top: 0;
margin-bottom: 12px;
}
ol, ul {
box-sizing: border-box;
}
ol li {
padding-left: 10px;
}
ol li, ul li {
padding-bottom: 12px;
}
address, dd, dt, li, p {
line-height: 1.5;
}
img {
max-width: 100% !important;
height: auto !important;
}
</style>

35
websites.csv Normal file → Executable file
View File

@ -1,34 +1,3 @@
"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", ""
"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", ""
"https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
"https://pub-london.escribemeetings.com/", "LondonArchive", ""
"https://pub-stthomas.escribemeetings.com/", "StThomasArchive", ""
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""

1 https://pub-brampton.escribemeetings.com/ https://pub-london.escribemeetings.com/ SubBramptonArchive LondonArchive
2 https://pub-markham.escribemeetings.com/ https://pub-stthomas.escribemeetings.com/ SubMarkhamArchive StThomasArchive
https://pub-cityofkingston.escribemeetings.com/ SubKingstonArchive
https://pub-barrie.escribemeetings.com/ SubBarrieArchive
https://pub-oshawa.escribemeetings.com/ SubOshawaArchive
3 https://pub-ottawa.escribemeetings.com/ https://pub-ottawa.escribemeetings.com/ OttawaArchive OttawaArchive
https://pub-owensound.escribemeetings.com/ SubOwenSoundArchive
https://pub-goderich.escribemeetings.com/ SubGoderichArchive
https://pub-oakville.escribemeetings.com/ SubOakvilleArchive
https://burlingtonpublishing.escribemeetings.com/ SubBurlingtonArchive
https://pub-milton.escribemeetings.com/ SubMiltonArchive
https://pub-durhamregion.escribemeetings.com/ SubDurhamArchive
https://pub-richmondhill.escribemeetings.com/ SubRichmondHillArchive
https://pub-whitby.escribemeetings.com/ SubWhitbyArchive
https://pub-london.escribemeetings.com/ LondonArchive London Meetings
https://pub-middlesexcounty.escribemeetings.com/ SubMiddlesexCountyArchive
https://pub-lucanbiddulph.escribemeetings.com/ SubLucanBiddulphArchive
https://pub-thamescentre.escribemeetings.com/ SubThamesCentreArchive
https://pub-stthomas.escribemeetings.com/ SubStThomasArchive
https://pub-northmiddlesex.escribemeetings.com/ SubNorthMiddlesexArchive
https://pub-strathroy-caradoc.escribemeetings.com/ SubStrathroyCaradocArchive
https://pub-adelaidemetcalfe.escribemeetings.com/ SubAdelaideMetcalfeArchive
https://pub-middlesexcentre.escribemeetings.com/ SubMiddsexCentreArchive
https://pub-mississauga.escribemeetings.com/ SubMississaugaArchive
https://pub-guelph.escribemeetings.com/ SubGuelphArchive
https://pub-regionofwaterloo.escribemeetings.com/ SubWaterlooArchive
https://pub-kitchener.escribemeetings.com/ SubKitchenerArchive
https://pub-hamilton.escribemeetings.com/ SubHamiltonArchive
https://pub-brantford.escribemeetings.com/ SubBrantfordArchive
https://pub-woodstock.escribemeetings.com/ SubWoodstockArchive
https://pub-stratford.escribemeetings.com/ SubStratfordArchive
https://pub-chatham-kent.escribemeetings.com/ SubChathamKentArchive
https://pub-cambridge.escribemeetings.com/ SubCambridgeArchive
https://pub-vaughan.escribemeetings.com/ SubVaughanArchive