From 535e9590541c84b73dc4871cae384f62caa2c00f Mon Sep 17 00:00:00 2001 From: Lillian Skinner Date: Tue, 7 Apr 2026 18:57:47 -0400 Subject: [PATCH] Upload files to "/" --- README.MD | 105 ++++++++++++ SCRAPE_ESCRIBE.SH | 99 +++++++++++ SCRAPE_LPS.SH | 76 +++++++++ SCRAPE_LTC.SH | 184 ++++++++++++++++++++ SCRAPE_MEET.SH | 423 ++++++++++++++++++++++++++++++++++++++++++++++ SCRAPE_OPEN.SH | 98 +++++++++++ SCRAPE_PLAN.SH | 351 ++++++++++++++++++++++++++++++++++++++ websites.csv | 34 ++++ 8 files changed, 1370 insertions(+) create mode 100644 README.MD create mode 100644 SCRAPE_ESCRIBE.SH create mode 100644 SCRAPE_LPS.SH create mode 100644 SCRAPE_LTC.SH create mode 100644 SCRAPE_MEET.SH create mode 100644 SCRAPE_OPEN.SH create mode 100644 SCRAPE_PLAN.SH create mode 100644 websites.csv diff --git a/README.MD b/README.MD new file mode 100644 index 0000000..c1cecbc --- /dev/null +++ b/README.MD @@ -0,0 +1,105 @@ +# City of London Scrapers +This is a collection of shell script scrapers that I have written for the City of London website. These are meant for my own use, so comments and code quality are lacking. If you need something scraped, or want to understand why/how I'm scraping the city, please reach out by email at "contact@lillianskinner.ca". Cheers. + +## websites.csv + +`websites.csv` holds an index of eScribe domains to crawl. The format is as follows: +``` +"<eScribe base URL>","<output directory>","<calendar name>" +``` +As an example, an entry might look like this: +``` +"https://pub-london.escribemeetings.com/", "LondonArchive", "" +``` +Files will be output to `./LondonArchive/Meetings/`. + +YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS! + +## Scrape eScribe meetings (SCRAPE_MEET.SH) + +This bash script will scrape meetings from the eScribe meetings platform. 
+ +The basic structure of the output files is: +``` +./<output directory>/Meetings/<...>/<...>/<...>/ + |- <...>.pdf + |- <...>.pdf + \- Attachments/ + |- <...>.pdf + |- <...>.pdf + \- etc etc +``` + +## Scrape eScribe JSONs (SCRAPE_ESCRIBE.SH) + +This bash script will scrape meeting JSON lists from the eScribe meetings platform. Each JSON will be split into batches of 50 meetings. + +The basic structure of the output files is: +``` +./<output directory in websites.csv>/Meetings (JSON)/<...>/ + |- <...>_0.json + |- <...>_1.json + \- etc etc +``` + +## Scrape planning applications (SCRAPE_PLAN.SH) + +This bash script will scrape planning applications from London's website at: https://london.ca/business-development/planning-development-applications/planning-applications + +The basic structure of the output files is: +``` +./LondonArchive/Planning Applications/<...>/ + \- <...> - 123 Example St/ + |- Info.txt + \- Attachments/ + |- <...>.pdf + |- <...>.pdf + \- etc etc +``` + +## Scrape London open data (SCRAPE_OPEN.SH) + +This bash script will scrape London's ArcGIS open data platform, including maps and statistics. The server is at: https://maps.london.ca/server/rest/services/OpenData + +The basic structure of the output files is: +``` +./LondonArchive_OpenData/ + |- <...>.xlsx.7z + |- <...>.csv.7z + \- Maps/ + |- <...>.7z + |- <...>.7z + \- etc etc +``` + +## Scrape London Transit Commission meetings (SCRAPE_LTC.SH) + +This bash script will scrape LTC meetings from their WordPress site at: https://www.londontransit.ca/agendas-and-minutes/ + +Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low. 
+ +The basic structure of the output files is: +``` +./LondonArchive/LTC//// + |- .pdf + |- .pdf + \- Attachments/ + |- .pdf + |- .pdf + \- etc etc +``` + +## Scrape London Police Services meetings (SCRAPE_LPS.SH) + +This bash script will scrape LPS meetings from their wordpress site at: https://londonpoliceserviceboard.com/board-meetings/ + +The basic structure of the output files is: +``` +./LondonArchive/LPS//// + |- .pdf + |- .pdf + \- Attachments/ + |- .pdf + |- .pdf + \- etc etc +``` \ No newline at end of file diff --git a/SCRAPE_ESCRIBE.SH b/SCRAPE_ESCRIBE.SH new file mode 100644 index 0000000..ad97b53 --- /dev/null +++ b/SCRAPE_ESCRIBE.SH @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +echo -e "\n-========================================================================-" +echo -e "-=- -=-" +echo -e "-=- SCRAPE_ESCRIBE.SH: Download eScribe meetings JSONs -=-" +echo -e "-=- -=-" +echo -e "-=- Lillian Skinner -=-" +echo -e "-=- -=-" +echo -e "-========================================================================-" + +# Warning to all who read this script: +# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works. + +# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person! 
+WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87" + +TEMP_DIR="./tmp/" +INDEX_PAGE="./tmp/index_cal.html" +SEARCH_PAGE="./tmp/search.html" +AGENDA_HTML="./tmp/work.html" +ADDENDUM_HTML="./tmp/addendum.html" +#VIDEO_TIMESTAMP_JSON="./tmp/time_cal.json" + +current_year=$(date +%Y) +current_month=$(date +%m) +current_day=$(date +%d) + +if [ -d "$TEMP_DIR" ]; then + rm -r $TEMP_DIR +fi +rm -f $INDEX_PAGE +rm -f $SEARCH_PAGE +rm -f $AGENDA_HTML + +mkdir $TEMP_DIR + +while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do + INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g') + CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') + CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') + + INDEX_END="FALSE" + while [[ $INDEX_END == "FALSE" ]]; do + echo "SCRAPE_ESCRIBE: Downloading eScribe index..." + wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress + if [ $? -ne 8 ]; then + FOUNDLIST="FALSE" + while IFS= read -r LINE; do + if [[ "TRUE" == $FOUNDLIST ]]; then + GREPENDLIST=$(echo $LINE | grep '