diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45ac61f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +LondonArchive/ +LondonScrapers_privdata/ +tmp/ +staging/ diff --git a/README.MD b/README.MD old mode 100644 new mode 100755 index c1cecbc..3591d35 --- a/README.MD +++ b/README.MD @@ -17,7 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS! ## Scrape eScribe meetings (SCRAPE_MEET.SH) -This bash script will scrape meetings from the eScribe meetings platform. +This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST=1` (true), meetings older than 2 months will be downloaded. Otherwise, they will be skipped. The basic structure of the output files is: ``` @@ -76,7 +76,7 @@ The basic structure of the output files is: This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/ -Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. +Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. The HTML --> PDF conversion needs the template page included at `./template/default.html`. 
The basic structure of the output files is: ``` @@ -102,4 +102,4 @@ The basic structure of the output files is: |- .pdf |- .pdf \- etc etc -``` \ No newline at end of file +```
diff --git a/SCRAPE_AGIS.SH b/SCRAPE_AGIS.SH
new file mode 100755
index 0000000..3910b95
--- /dev/null
+++ b/SCRAPE_AGIS.SH
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+#
+# SCRAPE_AGIS.SH: archives the layers of every MapServer service listed in the
+# folders of the ArcGIS REST catalogue at $ARGIS_URL.
+#
+# Every layer is queried page by page (MAX_REQUESTS features per page) as
+# GeoJSON and as KMZ into $TMP_STAGING, which is then packed into
+#   LondonArchive/ArcGIS/<service>/Layer <id> - <name>.7z
+#
+# Needs wget, curl, jq and 7z, plus _utils_download_helper, which this script
+# calls as `_utils_download_helper <url> <output file>` (presumably defined in
+# ./functions/.functions, the only file sourced here).
+echo -e "\n-========================================================================-"
+echo -e "-=- -=-"
+echo -e "-=- SCRAPE_AGIS.SH: Downloads ArcGIS maps -=-"
+echo -e "-=- -=-"
+echo -e "-=- Lillian Skinner -=-"
+echo -e "-=- -=-"
+echo -e "-========================================================================-"
+
+source ./functions/.functions
+
+WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
+
+ARGIS_URL="https://maps.london.ca/server/rest/services"
+
+TMP="./tmp"
+TMP_STAGING="./tmp/layers"
+SERVICELIST_JSON="$TMP/servicelist.json"
+FOLDER_JSON="$TMP/folder.json"
+SERVICE_JSON="$TMP/service.json"
+LAYERQUERY_JSON="$TMP/layer_query.json"
+COUNT_JSON="$TMP/count.json"
+
+# Number of features requested per query page.
+MAX_REQUESTS=2000
+
+# -p: the directories may be left over from an earlier run.
+mkdir -p "$TMP" "$TMP_STAGING"
+
+if ! wget "$ARGIS_URL?f=json" --user-agent="$WGET_UA" -O "$SERVICELIST_JSON" -q; then
+  echo "SCRAPE_AGIS: could not download the service list from $ARGIS_URL" >&2
+  exit 1
+fi
+
+jq -r '.folders[]?' "$SERVICELIST_JSON" | while IFS= read -r FOLDER; do
+  echo "Looking in $FOLDER"
+  if ! wget "$ARGIS_URL/$FOLDER?f=json" --user-agent="$WGET_UA" -O "$FOLDER_JSON" -q; then
+    echo "SCRAPE_AGIS: could not list folder $FOLDER, skipping it" >&2
+    continue
+  fi
+
+  jq -r '.services[]?
+    | select(.type=="MapServer")
+    | .name' "$FOLDER_JSON" | while IFS= read -r SERVICE; do
+    echo "Found $SERVICE"
+    # Not read anywhere in this script; kept as in the original.
+    SERVICE_PATH="$FOLDER/$SERVICE"
+    echo "$ARGIS_URL/$SERVICE/MapServer"
+    if ! wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q; then
+      echo "SCRAPE_AGIS: could not describe service $SERVICE, skipping it" >&2
+      continue
+    fi
+
+    mkdir -p "LondonArchive/ArcGIS/${SERVICE}"
+    jq -r '.layers[]?
+      | "\(.id)|\(.name)"' "$SERVICE_JSON" | while IFS='|' read -r LAYERID LAYERNAME; do
+      # Every layer starts from an empty staging directory.
+      rm -rf -- "${TMP_STAGING:?}"
+      mkdir -p "$TMP_STAGING"
+
+      # File-name-safe layer name: slashes and backslashes become spaces and
+      # whitespace runs collapse. The name is quoted so that characters such
+      # as * are never glob-expanded against the working directory.
+      LAYERNAME_CLEAN=$(printf '%s\n' "$LAYERNAME" \
+        | sed -E 's#[/\\]# #g; s/[[:space:]]+/ /g; s/^ //; s/ $//')
+
+      # Drop the previous layer's answer so a failed request cannot reuse it.
+      rm -f -- "$COUNT_JSON"
+      curl -s -A "$WGET_UA" "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&returnCountOnly=true&f=json" -o "$COUNT_JSON"
+      ITEM_COUNT=$(jq -r '.count' "$COUNT_JSON")
+      if ! [[ "$ITEM_COUNT" =~ ^[0-9]+$ ]]; then
+        echo "SCRAPE_AGIS: no feature count for layer $LAYERID, requesting one page" >&2
+        ITEM_COUNT=0
+      fi
+
+      i=0
+      j=0
+      # The first page is always requested; further pages only while features
+      # remain, so a count that is an exact multiple of MAX_REQUESTS does not
+      # cause one extra, empty download.
+      # NOTE(review): assumes the server returns up to MAX_REQUESTS features per
+      # query; confirm against the layers' maxRecordCount.
+      while :; do
+        echo "Downloading $LAYERID-${j} $LAYERNAME_CLEAN"
+        echo "$i of $ITEM_COUNT"
+
+        PAGE_URL="$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}"
+        PAGE_FILE="$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}"
+        _utils_download_helper "${PAGE_URL}&f=geojson" "${PAGE_FILE}.geojson"
+        echo "Done GeoJSON!"
+        _utils_download_helper "${PAGE_URL}&f=kmz" "${PAGE_FILE}.kmz"
+        echo "Done KMZ!"
+
+        i=$(( i + MAX_REQUESTS ))
+        j=$(( j + 1 ))
+        (( i < ITEM_COUNT )) || break
+      done
+      7z a "LondonArchive/ArcGIS/${SERVICE}/Layer ${LAYERID} - ${LAYERNAME_CLEAN}.7z" "$TMP_STAGING"
+    done
+  done
+done
diff --git a/SCRAPE_ESCRIBE.SH b/SCRAPE_ESCRIBE.SH old mode 100644 new mode 100755 index ad97b53..34d3c6d --- a/SCRAPE_ESCRIBE.SH +++ b/SCRAPE_ESCRIBE.SH @@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') - INDEX_END="FALSE" - while [[ $INDEX_END == "FALSE" ]]; do + INDEX_END=0 + while (( ! INDEX_END )); do echo "SCRAPE_ESCRIBE: Downloading eScribe index..." wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress if [ $? 
-ne 8 ]; then - FOUNDLIST="FALSE" + FOUNDLIST=0 while IFS= read -r LINE; do - if [[ "TRUE" == $FOUNDLIST ]]; then + if (( FOUNDLIST )); then GREPENDLIST=$(echo $LINE | grep '