From 16c4905b4112a4842cded6099d88ba387107cfa4 Mon Sep 17 00:00:00 2001 From: Lillian Skinner <56081713+rvtr@users.noreply.github.com> Date: Fri, 19 Jun 2026 23:30:51 -0400 Subject: [PATCH] Separate functions --- .gitignore | 4 + README.MD | 6 +- SCRAPE_AGIS.SH | 66 ++++ SCRAPE_ESCRIBE.SH | 14 +- SCRAPE_GINV.SH | 329 ++++++++++++++++++ SCRAPE_GINV_OLD.SH | 301 +++++++++++++++++ SCRAPE_LPS.SH | 39 +-- SCRAPE_LTC.SH | 69 ++-- SCRAPE_MEET.SH | 532 ++++++++++-------------------- SCRAPE_MPaS.SH | 47 +++ SCRAPE_OPEN.SH | 44 ++- SCRAPE_PLAN.SH | 78 +---- functions/.functions | 9 + functions/.functions.escribe | 133 ++++++++ functions/.functions.filepro | 34 ++ functions/.functions.time | 71 ++++ functions/.functions.utils | 104 ++++++ template/default.html | 78 +++++ template/default_getinvolved.html | 83 +++++ websites.csv | 35 +- 20 files changed, 1531 insertions(+), 545 deletions(-) create mode 100644 .gitignore mode change 100644 => 100755 README.MD create mode 100755 SCRAPE_AGIS.SH mode change 100644 => 100755 SCRAPE_ESCRIBE.SH create mode 100755 SCRAPE_GINV.SH create mode 100755 SCRAPE_GINV_OLD.SH mode change 100644 => 100755 SCRAPE_LPS.SH mode change 100644 => 100755 SCRAPE_LTC.SH mode change 100644 => 100755 SCRAPE_MEET.SH create mode 100755 SCRAPE_MPaS.SH mode change 100644 => 100755 SCRAPE_OPEN.SH mode change 100644 => 100755 SCRAPE_PLAN.SH create mode 100644 functions/.functions create mode 100644 functions/.functions.escribe create mode 100644 functions/.functions.filepro create mode 100644 functions/.functions.time create mode 100644 functions/.functions.utils create mode 100644 template/default.html create mode 100644 template/default_getinvolved.html mode change 100644 => 100755 websites.csv diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45ac61f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +LondonArchive/ +LondonScrapers_privdata/ +tmp/ +staging/ diff --git a/README.MD b/README.MD old mode 100644 new mode 100755 index 
c1cecbc..3591d35 --- a/README.MD +++ b/README.MD @@ -17,7 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS! ## Scrape eScribe meetings (SCRAPE_MEET.SH) -This bash script will scrape meetings from the eScribe meetings platform. +This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST=1` (true), meetings older than 2 months will be downloaded. Otherwise, they will be skipped. The basic structure of the output files is: ``` @@ -76,7 +76,7 @@ The basic structure of the output files is: This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/ -Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. +Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. The HTML --> PDF conversion needs the template page included at `./template/default.html`. 
The basic structure of the output files is: ``` @@ -102,4 +102,4 @@ The basic structure of the output files is: |- .pdf |- .pdf \- etc etc -``` \ No newline at end of file +```
diff --git a/SCRAPE_AGIS.SH b/SCRAPE_AGIS.SH
new file mode 100755
index 0000000..3910b95
--- /dev/null
+++ b/SCRAPE_AGIS.SH
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+#
+# SCRAPE_AGIS.SH: archive every MapServer layer published under $ARGIS_URL.
+#
+# Walks folder -> MapServer service -> layer. Each layer's features are
+# requested page by page as GeoJSON and KMZ into $TMP_STAGING, which is then
+# packed into "LondonArchive/ArcGIS/<service>/Layer <id> - <name>.7z".
+#
+# Uses wget, curl, jq and 7z, plus _utils_download_helper, which is expected
+# to be defined by sourcing ./functions/.functions (a path relative to the
+# current directory).
+
+echo -e "\n-========================================================================-"
+echo -e "-=- -=-"
+echo -e "-=- SCRAPE_AGIS.SH: Downloads ArcGIS maps -=-"
+echo -e "-=- -=-"
+echo -e "-=- Lillian Skinner -=-"
+echo -e "-=- -=-"
+echo -e "-========================================================================-"
+
+source ./functions/.functions
+
+# Every download below goes through this helper; stop early if it is missing.
+if ! command -v _utils_download_helper >/dev/null; then
+  echo "SCRAPE_AGIS: _utils_download_helper is not defined" >&2
+  exit 1
+fi
+
+WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
+
+ARGIS_URL="https://maps.london.ca/server/rest/services"
+
+TMP="./tmp"
+TMP_STAGING="./tmp/layers"
+SERVICELIST_JSON="$TMP/servicelist.json"
+FOLDER_JSON="$TMP/folder.json"
+SERVICE_JSON="$TMP/service.json"
+# NOTE(review): not referenced anywhere in this script; kept in case a sourced
+# helper reads it -- confirm and drop if unused.
+LAYERQUERY_JSON="$TMP/layer_query.json"
+
+# Records asked for per query (resultRecordCount); resultOffset advances by it.
+# NOTE(review): assumes the server really returns this many records per query;
+# if its own limit is lower, records between pages are missed -- confirm.
+MAX_REQUESTS=2000
+
+# -p: ./tmp is left behind by earlier runs, so a bare mkdir fails on reruns.
+mkdir -p "$TMP" "$TMP_STAGING"
+
+if ! wget "$ARGIS_URL?f=json" --user-agent="$WGET_UA" -O "$SERVICELIST_JSON" -q; then
+  echo "SCRAPE_AGIS: could not download the service list from $ARGIS_URL" >&2
+  exit 1
+fi
+
+jq -r '.folders[]?' "$SERVICELIST_JSON" | while IFS= read -r FOLDER; do
+  if ! wget "$ARGIS_URL/$FOLDER?f=json" --user-agent="$WGET_UA" -O "$FOLDER_JSON" -q; then
+    echo "SCRAPE_AGIS: could not list folder $FOLDER, skipping it" >&2
+    continue
+  fi
+  echo "Looking in $FOLDER"
+  jq -r '.services[]?
+    | select(.type=="MapServer")
+    | .name' "$FOLDER_JSON" | while IFS= read -r SERVICE; do
+    echo "Found $SERVICE"
+    # NOTE(review): $SERVICE goes into URLs without $FOLDER in front, so the
+    # listing presumably returns folder-qualified names already -- confirm.
+    echo "$ARGIS_URL/$SERVICE/MapServer"
+    if ! wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q; then
+      echo "SCRAPE_AGIS: could not fetch service $SERVICE, skipping it" >&2
+      continue
+    fi
+
+    mkdir -p "LondonArchive/ArcGIS/${SERVICE}"
+    jq -r '.layers[]?
+      | "\(.id)|\(.name)"' "$SERVICE_JSON" | while IFS='|' read -r LAYERID LAYERNAME; do
+      # Start every layer from an empty staging directory.
+      rm -rf -- "${TMP_STAGING:?}"
+      mkdir -p "$TMP_STAGING"
+
+      # File-name-safe layer name: trim and squeeze whitespace, then turn
+      # slashes and backslashes into spaces. The name is passed quoted so
+      # glob characters in it are never expanded against the working directory.
+      LAYERNAME_CLEAN=$(printf '%s\n' "$LAYERNAME" \
+        | sed -E -e 's/[[:space:]]+/ /g' -e 's/^ //' -e 's/ $//' -e 's#/# #g' -e 's#\\# #g' -e 's/ {2,}/ /g')
+
+      # Parse the count straight from the response instead of a temp file, so
+      # a failed request cannot silently reuse the previous layer's count.
+      ITEM_COUNT=$(curl -s "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&returnCountOnly=true&f=json" \
+        | jq -r '.count // empty')
+      if [[ ! "$ITEM_COUNT" =~ ^[0-9]+$ ]]; then
+        echo "SCRAPE_AGIS: no record count for layer $LAYERID ($LAYERNAME_CLEAN), trying one page" >&2
+        ITEM_COUNT=0
+      fi
+      i=0
+      j=0
+
+      # The first page is always requested, even for an empty or uncountable
+      # layer. Further pages need an offset strictly below the count; "<="
+      # fetched an extra empty page when the count was an exact multiple.
+      while (( i == 0 || i < ITEM_COUNT )); do
+        echo "Downloading $LAYERID-${j} $LAYERNAME_CLEAN"
+        echo "$i of $ITEM_COUNT"
+
+        _utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=geojson" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.geojson"
+        echo "Done GeoJSON!"
+        _utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=kmz" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.kmz"
+        echo "Done KMZ!"
+        i=$(( i + MAX_REQUESTS ))
+        j=$(( j + 1 ))
+      done
+      7z a "LondonArchive/ArcGIS/${SERVICE}/Layer ${LAYERID} - ${LAYERNAME_CLEAN}.7z" "$TMP_STAGING"
+    done
+  done
+done
diff --git a/SCRAPE_ESCRIBE.SH b/SCRAPE_ESCRIBE.SH old mode 100644 new mode 100755 index ad97b53..34d3c6d --- a/SCRAPE_ESCRIBE.SH +++ b/SCRAPE_ESCRIBE.SH @@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') - INDEX_END="FALSE" - while [[ $INDEX_END == "FALSE" ]]; do + INDEX_END=0 + while (( ! INDEX_END )); do echo "SCRAPE_ESCRIBE: Downloading eScribe index..." wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress if [ $?
-ne 8 ]; then - FOUNDLIST="FALSE" + FOUNDLIST=0 while IFS= read -r LINE; do - if [[ "TRUE" == $FOUNDLIST ]]; then + if (( FOUNDLIST )); then GREPENDLIST=$(echo $LINE | grep '