Upload files to "/"

Lillian Skinner 2026-04-07 18:57:47 -04:00
commit 535e959054
8 changed files with 1370 additions and 0 deletions

105
README.MD Normal file

@ -0,0 +1,105 @@
# City of London Scrapers
This is a collection of shell script scrapers that I have written for the City of London website. These are meant for my own use, so comments and code quality are lacking. If you need something scraped, or want to understand why/how I'm scraping the city, please reach out by email at "contact@lillianskinner.ca". Cheers.
## websites.csv
`websites.csv` holds an index of eScribe domains to crawl. The format is as follows:
```
"<eScribe domain>","<output directory>","<leave empty, this entry is used by other tools>"
```
As an example, an entry might look like this:
```
"https://pub-london.escribemeetings.com/", "LondonArchive", ""
```
Files will be output to `./LondonArchive/Meetings/`.
YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
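For reference, the eScribe scrapers read this file with a plain whitespace-split `read` loop and strip the quotes and commas themselves. A minimal sketch of that loop (mirroring SCRAPE_MEET.SH / SCRAPE_ESCRIBE.SH):
```
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
    INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/"//g' | sed 's/,//g')
    CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/"//g' | sed 's/,//g')
    CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/"//g' | sed 's/,//g')
    echo "Would scrape $INDEX_URL into ./$CITY_ARCHIVE_NAME/"
done < websites.csv
```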
## Scrape eScribe meetings (SCRAPE_MEET.SH)
This bash script will scrape meetings from the eScribe meetings platform.
The basic structure of the output files is:
```
./<output directory in websites.csv>/Meetings/<board/committee name>/<year>/<mm-dd>/
|- <agenda>.pdf
|- <minutes>.pdf
\- Attachments/
|- <attachment 1>.pdf
|- <attachment 2>.pdf
\- etc etc
```
## Scrape eScribe JSONs (SCRAPE_ESCRIBE.SH)
This bash script will scrape meeting JSON lists from the eScribe meetings platform. Each JSON will be split into batches of 50 meetings.
The basic structure of the output files is:
```
./<output directory in websites.csv>/Meetings (JSON)/<board/committee name>/
|- <board/committee name>_0.json
|- <board/committee name>_1.json
\- etc etc
```
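Under the hood, each batch is fetched by POSTing a committee name and a 1-based page number to the site's `MeetingsCalendarView.aspx/PastMeetings` endpoint. A single request looks roughly like this (the committee name is an example; the batch size of 50 is simply what the server returns per page):
```
curl -s --insecure \
    -H "Content-Type: application/json" \
    -d "{'type': 'Council Meeting', 'pageNumber': 1}" \
    -X POST "https://pub-london.escribemeetings.com/MeetingsCalendarView.aspx/PastMeetings" \
    | jq '.d.TotalCount, (.d.Meetings | length)'
```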
## Scrape planning applications (SCRAPE_PLAN.SH)
This bash script will scrape planning applications from London's website at: https://london.ca/business-development/planning-development-applications/planning-applications
The basic structure of the output files is:
```
./LondonArchive/Planning Applications/<application type>/
\- <file no.> - 123 Example St/
|- Info.txt
\- Attachments/
|- <attachment 1>.pdf
|- <attachment 2>.pdf
\- etc etc
```
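The `<application type>` folder comes from the prefix of the application's file number (the part before the first dash). The script itself uses an if/elif chain; the same mapping expressed as a rough case-statement sketch:
```
case "$PROJECT_FILE_NUM_TYPE" in
    O)     PROJECT_FILE_TYPE="Official Plan Amendment" ;;
    Z)     PROJECT_FILE_TYPE="Zoning By-law Amendment" ;;
    OZ)    PROJECT_FILE_TYPE="Official Plan and Zoning By-law Amendment" ;;
    TZ)    PROJECT_FILE_TYPE="Temporary Zoning By-law Amendment" ;;
    M)     PROJECT_FILE_TYPE="Minor Zoning By-law Amendment" ;;
    H)     PROJECT_FILE_TYPE="Holding Provision By-law Amendment" ;;
    39T)   PROJECT_FILE_TYPE="Draft Plan of Subdivision" ;;
    39CD)  PROJECT_FILE_TYPE="Draft Plan of Condominium" ;;
    SPA2*) PROJECT_FILE_TYPE="Site Plan Control Application" ;;
    "Line of Sight") PROJECT_FILE_TYPE="Line of Sight" ;;
    *)     PROJECT_FILE_TYPE="BAD RECORD TYPE" ;;
esac
```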
## Scrape London open data (SCRAPE_OPEN.SH)
This bash script will scrape London's ArcGIS open data platform, including maps and statistics. The server is at: https://maps.london.ca/server/rest/services/OpenData
The basic structure of the output files is:
```
./LondonArchive_OpenData/
|- <statistics 1>.xlsx.7z
|- <statistics 2>.csv.7z
\- Maps/
|- <map 1>.7z
|- <map 2>.7z
\- etc etc
```
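The item index comes from the ArcGIS Hub search API, paged 100 items at a time and filtered to the City of London's open data groups. One page of the search (group IDs copied from the script; `startindex=100` requests the second page) looks like:
```
curl --get \
    --data-urlencode "filter=((group IN (de724381536540a5bf2d005fb32ec92a, d17e3e9bfd274e88aeed15fa165bf1e3, b7ab05d332c24dd2ba485acd2ac92837, b15cf62bc0a14990a75e348930b0cb4e)))" \
    --data-urlencode "limit=100" \
    --data-urlencode "startindex=100" \
    "https://hub.arcgis.com/api/search/v1/collections/all/items" \
    | jq '.numberMatched, .numberReturned'
```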
## Scrape London Transit Commission meetings (SCRAPE_LTC.SH)
This bash script will scrape LTC meetings from their WordPress site at: https://www.londontransit.ca/agendas-and-minutes/
Attachments are downloaded as the HTML versions and converted to PDF; the original documents (linked from the agenda PDFs) may not always be OCRed, and their quality can be low.
The basic structure of the output files is:
```
./LondonArchive/LTC/<board/committee name>/<year>/<mm-dd>/
|- <agenda>.pdf
|- <minutes>.pdf
\- Attachments/
|- <attachment 1>.pdf
|- <attachment 2>.pdf
\- etc etc
```
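HTML agendas and attachments are rendered to PDF by prepending the article markup with a CSS template and running it through `wkhtmltopdf` (an external dependency, along with the `./template/default.html` stylesheet the script expects). The conversion step, boiled down (paths are illustrative):
```
cat ./template/default.html > ./tmp/new.html
echo "$ARTICLE_HTML" >> ./tmp/new.html    # $ARTICLE_HTML: the scraped <article> markup
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/Commission/2025/01-29/Attachments/Example Report.pdf"
```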
## Scrape London Police Services meetings (SCRAPE_LPS.SH)
This bash script will scrape LPS meetings from their WordPress site at: https://londonpoliceserviceboard.com/board-meetings/
The basic structure of the output files is:
```
./LondonArchive/LPS/<board/committee name>/<year>/<mm-dd>/
|- <agenda>.pdf
|- <minutes>.pdf
\- Attachments/
|- <attachment 1>.pdf
|- <attachment 2>.pdf
\- etc etc
```
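The `<year>/<mm-dd>` folder names come from a small date-normalization helper (`conv_date` in SCRAPE_LPS.SH and SCRAPE_MEET.SH, inlined in SCRAPE_LTC.SH) that splits dates like `January 7, 2025` into zero-padded parts. A trimmed-down sketch:
```
conv_date() {
    MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/')
    MEETING_DAY=$(printf "%02d" "$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')")
    MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
    case "$MEETING_MONTH_WORD" in
        Jan*) MEETING_MONTH="01" ;; Feb*) MEETING_MONTH="02" ;; Mar*) MEETING_MONTH="03" ;;
        Apr*) MEETING_MONTH="04" ;; May)  MEETING_MONTH="05" ;; Jun*) MEETING_MONTH="06" ;;
        Jul*) MEETING_MONTH="07" ;; Aug*) MEETING_MONTH="08" ;; Sep*) MEETING_MONTH="09" ;;
        Oct*) MEETING_MONTH="10" ;; Nov*) MEETING_MONTH="11" ;; Dec*) MEETING_MONTH="12" ;;
        *)    MEETING_MONTH="--" ;;
    esac
}
conv_date "January 7, 2025"
echo "$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"   # -> 2025/01-07
```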

99
SCRAPE_ESCRIBE.SH Normal file

@ -0,0 +1,99 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_ESCRIBE.SH: Download eScribe meetings JSONs -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can just pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
INDEX_PAGE="./tmp/index_cal.html"
SEARCH_PAGE="./tmp/search.html"
AGENDA_HTML="./tmp/work.html"
ADDENDUM_HTML="./tmp/addendum.html"
#VIDEO_TIMESTAMP_JSON="./tmp/time_cal.json"
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
if [ -d "$TEMP_DIR" ]; then
rm -r $TEMP_DIR
fi
rm -f $INDEX_PAGE
rm -f $SEARCH_PAGE
rm -f $AGENDA_HTML
mkdir $TEMP_DIR
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
INDEX_END="FALSE"
while [[ $INDEX_END == "FALSE" ]]; do
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
if [ $? -ne 8 ]; then
FOUNDLIST="FALSE"
while IFS= read -r LINE; do
if [[ "TRUE" == $FOUNDLIST ]]; then
GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE"
break
else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
echo "-========================================================================-"
echo "- "$MEETING_NAME
# Pages start at 1. Ew.
x=1
mkdir "${CITY_ARCHIVE_NAME}"
mkdir "${CITY_ARCHIVE_NAME}/Meetings (JSON)/"
mkdir "${CITY_ARCHIVE_NAME}/Meetings (JSON)/${MEETING_NAME}"
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${CITY_ARCHIVE_NAME}/Meetings (JSON)/${MEETING_NAME}/${MEETING_NAME}_$(( $x - 1 )).json"
y=0
i=0
NUM_MEETINGS=$(cat "${CITY_ARCHIVE_NAME}/Meetings (JSON)/${MEETING_NAME}/${MEETING_NAME}_$(( $x - 1 )).json" | jq '.d.TotalCount')
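# NUM_MEETINGS is the server-reported total for this committee; NUM_IN_JSON below is the count in the current page's JSON (at most 50).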
while (true); do
NUM_IN_JSON=$(cat "${CITY_ARCHIVE_NAME}/Meetings (JSON)/${MEETING_NAME}/${MEETING_NAME}_$(( $x - 1 )).json" | jq '.d.Meetings | length' )
# Decrease in the meeting count == we're on the final page.
if (( $i >= $NUM_IN_JSON )) && (( $NUM_IN_JSON >= 50)); then
((x++))
i=0
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${CITY_ARCHIVE_NAME}/Meetings (JSON)/${MEETING_NAME}/${MEETING_NAME}_$(( $x - 1 )).json"
elif (( $i >= $NUM_IN_JSON )); then
break
fi
#echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
((i++))
((y++))
done
fi
fi
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE"
fi
done < $INDEX_PAGE
else
INDEX_END="TRUE"
echo "SCRAPE_ESCRIBE: Couldn't save index!"
fi
done
done < websites.csv

76
SCRAPE_LPS.SH Normal file

@ -0,0 +1,76 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
echo -e "-=- -=-"
echo -e "-=- https://gist.github.com/rvtr/******************************** -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
conv_date() {
echo "$1"
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
}
MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can just pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
mkdir "./LondonArchive"
mkdir "./LondonArchive/LPS"
mkdir "./tmp"
wget --user-agent="$WGET_UA" "https://londonpoliceserviceboard.com/board-meetings/" -O "./tmp/index.html" -q #--show-progress
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
# If I don't set these values then "10#: invalid integer constant"
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
while IFS= read -r LINE_PRE; do
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
FOUND_ATTACH_TYPE="$(echo $LINE | grep '<h3 class="fusion-title-heading' | sed 's/.*<h3[^>]*>\([^<]*\)<[\/:-]h3>.*/\1/')"
FOUND_YEAR_HEADER="$(echo $LINE | grep 'tabindex="0" aria-labelledby="fusion-tab-' | sed 's/.*aria-labelledby="\([^"]*\)".*/\1/' | sed 's/.*fusion-tab-//')"
if [[ "$FOUND_YEAR_HEADER" != "" ]]; then
echo $FOUND_ATTACH_TYPE
echo $FOUND_YEAR_HEADER
YEAR="$FOUND_YEAR_HEADER"
if [[ "$FOUND_ATTACH_TYPE" == "Meeting Minutes" ]]; then
ATTACH_TYPE="Minutes"
elif [[ "$FOUND_ATTACH_TYPE" == "Agenda and Report Packages" ]]; then
ATTACH_TYPE="Agenda"
fi
fi
FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">')
if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then
conv_date "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')"
mkdir -p "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/"
wget "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" -O "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/$ATTACH_TYPE.pdf" -q
fi
done < "./tmp/index.html"

184
SCRAPE_LTC.SH Normal file

@ -0,0 +1,184 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LTC.SH: Downloads LTC committee agendas and minutes -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
MEETINGS_PAGE="./tmp.html"
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can just pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
mkdir "./LondonArchive"
mkdir "./LondonArchive/LTC"
mkdir "./tmp"
wget --user-agent="$WGET_UA" "https://www.londontransit.ca/agendas-and-minutes/" -O "./tmp/index.html" -q #--show-progress
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
# If I don't set these values then "10#: invalid integer constant"
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
while IFS= read -r LINE_PRE; do
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
# The only <h2> without a class is the title of a committee. We'll confirm it is the title by checking for "Agendas and Minutes".
GREPCOMMITTEE=$(echo $LINE | grep "<h2>" | grep "Agendas and Minutes")
GREPDATE=$(echo $LINE | grep "</strong></td>")
if [[ "$GREPCOMMITTEE" != "" ]]; then
COMMITTEENAME=$(echo $LINE | sed 's/<h2>//' | sed 's/<\/h2>//')
echo "NEW COMMITTEE"
echo "$COMMITTEENAME"
if [[ "$(echo "$COMMITTEENAME" | grep "APTSAC")" != "" ]]; then
echo "Committee slug set"
COMMITTEENAME_SLUG="Accessible Public Transit Services Advisory Committee"
elif [[ "$(echo "$COMMITTEENAME" | grep "Commission")" != "" ]]; then
echo "Committee slug set"
COMMITTEENAME_SLUG="Commission"
fi
# Only a marker for a new committee. Do nothing else.
GREPCOMMITTEE=""
elif [[ "$GREPDATE" != "" ]]; then
# Remove HTML junk from date string.
DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
MEETING_MONTH_WORD=$(echo "$DATES_CLEAN" | sed -E 's/^([A-Za-z]+) .*/\1/')
MEETING_DAY_SHORT=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
MEETING_DAY=$(printf "%02d" ${MEETING_DAY_SHORT#0})
MEETING_YEAR=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
echo " NEW MEETING FOUND"
echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
GREPDATE=""
else
# Has a previous meeting been set? What about a date?
# Remove comparison to current dates in order to download full page. Adding this for automated LA scripts.
if [[ "COMMITTEENAME" != "" ]] && [[ "MEETING_YEAR" != "" ]] && (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
# Not changing meetings, and we know that an old meeting has alread been set. Keep going.
# If match --> make folder --> download
AGENDAURL=$(echo $LINE | grep "PDF Agenda" | grep "\.pdf" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/')
AGENDAHTMLURL=$(echo $LINE | grep "HTML Agenda" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
MINUTESURL=$(echo $LINE | grep "PDF Minutes" | grep "\.pdf" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/')
MINUTESHTMLURL=$(echo $LINE | grep "HTML Minutes" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
# Yes, I know there are HTML agendas. LTC is a lower priority, so I am not able to put a bunch of time into making a parser for the HTML. Sorry.
# Well... this aged well.
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR" 2> /dev/null
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY" 2> /dev/null
if [[ "$AGENDAURL" != "" ]]; then
echo " DOWNLOAD AGENDA PDF"
echo " $AGENDAURL"
wget --user-agent="$WGET_UA" "$AGENDAURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Agenda.pdf" -c -q #--show-progress
elif [[ "$MINUTESURL" != "" ]]; then
echo " DOWNLOAD MINUTES PDF"
echo " $MINUTESURL"
wget --user-agent="$WGET_UA" "$MINUTESURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Minutes.pdf" -c -q #--show-progress
elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then
if [[ "$AGENDAHTMLURL" != "" ]]; then
echo " DOWNLOAD AGENDA HTML TO CRAWL"
echo " $AGENDAHTMLURL"
wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress
elif [[ "$MINUTESHTMLURL" != "" ]]; then
echo " DOWNLOAD MINUTES HTML TO CRAWL"
echo " $MINUTESHTMLURL"
wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress
fi
while IFS= read -r LINE_HTML_PRE; do
LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /')
GREPARTICLESTART=$(echo $LINE_HTML | grep "<article")
GREPARTICLEEND=$(echo $LINE_HTML | grep "</article>")
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
if [[ "$GREPARTICLESTART" != "" ]]; then
echo " FOUND INDEX ARTICLE START"
ISARTICLE="TRUE"
elif [[ "$GREPARTICLEEND" != "" ]]; then
echo " END OF INDEX ARTICLE"
ISARTICLE=""
elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
ISPDF=$(echo $GREPLINK | grep "\.pdf")
if [[ "$ISPDF" != "" ]]; then
PDFNAME=$(echo $ISPDF | sed 's/.*\///')
echo " DOWNLOAD ATTACHMENT PDF"
echo " $ISPDF"
wget --user-agent="$WGET_UA" "$ISPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFNAME" -c -q #--show-progress
else
# Extract title of attachment
ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&amp;/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g')
echo " DOWNLOAD ATTACHMENT HTML"
echo " $ATTACHTITLE"
echo " $GREPLINK"
wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress
while IFS= read -r LINE_ATTACH_PRE; do
LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /')
GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article")
GREPATTACHMENTARTICLEEND=$(echo $LINE_ATTACH | grep "</article>")
GREPATTACHMENTLINK=$(echo $LINE_ATTACH | grep "\.pdf" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/')
if [[ "$GREPATTACHMENTARTICLESTART" != "" ]]; then
echo " FOUND ATTACHMENT ARTICLE START"
# CSS for the HTML is in the default template
cat ./template/default.html > ./tmp/new.html
echo "$LINE_ATTACH" >> ./tmp/new.html
ISATTACHMENTARTICLE="TRUE"
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
echo " END OF ATTACHMENT ARTICLE"
echo "$LINE_ATTACH" >> ./tmp/new.html
echo " PROCESSED TO PDF"
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
ISATTACHMENTARTICLE=""
elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
if [[ "$ISREFPDF" != "" ]]; then
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
echo " $GREPATTACHMENTLINK"
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
fi
elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then
echo "$LINE_ATTACH" >> ./tmp/new.html
fi
LINE_ATTACH=""
GREPATTACHMENTARTICLESTART=""
GREPATTACHMENTARTICLEEND=""
GREPATTACHMENTLINK=""
done < ./tmp/attachment.html
ISPDF=""
fi
fi
LINE_HTML=""
GREPARTICLESTART=""
GREPARTICLEEND=""
GREPLINK=""
done < ./tmp/work.html
fi
AGENDAURL=""
AGENDAHTMLURL=""
MINUTESURL=""
MINUTESHTMLURL=""
fi
fi
fi
done < "./tmp/index.html"

423
SCRAPE_MEET.SH Normal file

@ -0,0 +1,423 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
conv_date() {
echo "$1"
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
}
conv_date_alt() {
echo "$1"
MEETING_MONTH_WORD=$(echo "$1" | sed 's/^[^ ]* //' | sed 's/ .*//')
MEETING_DAY_SHORT=$(echo "$1" | sed 's/ .*//')
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
MEETING_YEAR=$(echo "$1" | sed 's/.* //')
case "$MEETING_MONTH_WORD" in
Jan*) MEETING_MONTH="01" ;;
Feb*) MEETING_MONTH="02" ;;
Mar*) MEETING_MONTH="03" ;;
Apr*) MEETING_MONTH="04" ;;
May) MEETING_MONTH="05" ;;
Jun*) MEETING_MONTH="06" ;;
Jul*) MEETING_MONTH="07" ;;
Aug*) MEETING_MONTH="08" ;;
Sep*) MEETING_MONTH="09" ;;
Oct*) MEETING_MONTH="10" ;;
Nov*) MEETING_MONTH="11" ;;
Dec*) MEETING_MONTH="12" ;;
*) MEETING_MONTH="--" ;;
esac
}
set_agenda_url() {
case "$1" in
'"Agenda (HTML)"')
AGENDA_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Agenda (PDF)"')
AGENDA_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Revised Agenda (HTML)"')
AGENDA_REVISE_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Revised Agenda (PDF)"')
AGENDA_REVISE_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Minutes (HTML)"')
MINUTES_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Minutes (PDF)"')
MINUTES_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Minutes with Attachments (PDF)"')
MINUTES_ATTACH_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Agenda Full Package (HTML)"')
AGENDA_FULL_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Agenda Full Package (PDF)"')
AGENDA_FULL_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Agenda Cover Page (HTML)"')
AGENDA_COVER_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Agenda Cover Page (PDF)"')
AGENDA_COVER_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Post Agenda (HTML)"')
AGENDA_POST_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Post Agenda (PDF)"')
AGENDA_POST_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
'"Addendum (HTML)"')
ADDENDUM_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
'"Addendum (PDF)"')
ADDENDUM_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
esac
}
clear_agenda_url() {
AGENDA_HTML_URL=""
AGENDA_PDF_URL=""
AGENDA_REVISE_HTML_URL=""
AGENDA_REVISE_PDF_URL=""
MINUTES_HTML_URL=""
MINUTES_PDF_URL=""
MINUTES_ATTACH_PDF_URL=""
AGENDA_FULL_HTML_URL=""
AGENDA_FULL_PDF_URL=""
AGENDA_COVER_HTML_URL=""
AGENDA_COVER_PDF_URL=""
AGENDA_POST_HTML_URL=""
AGENDA_POST_PDF_URL=""
ADDENDUM_HTML_URL=""
ADDENDUM_PDF_URL=""
}
download_agendas() {
if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then
if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then
echo "Saving revised agenda as PDF..."
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$1/Agenda_Revised.pdf" -N -q #--show-progress
fi
if [[ $AGENDA_PDF_URL != "" ]]; then
echo "Saving regular agenda as PDF..."
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$1/Agenda.pdf" -N -q #--show-progress
fi
elif [[ $AGENDA_REVISE_HTML_URL != "" ]] || [[ $AGENDA_HTML_URL != "" ]]; then
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
echo "Saving revised agenda as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$1/Agenda_Revised.html" -N -q #--show-progress
fi
if [[ $AGENDA_HTML_URL != "" ]]; then
echo "Saving regular agenda as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$1/Agenda.html" -N -q #--show-progress
fi
elif [[ $AGENDA_FULL_PDF_URL != "" ]] || [[ $AGENDA_FULL_HTML_URL != "" ]]; then
if [[ $AGENDA_FULL_PDF_URL != "" ]]; then
echo "Saving full package agenda as PDF... (no HTML found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_PDF_URL" -O "$1/Agenda_FullPackage.pdf" -N -q #--show-progress
fi
if [[ $AGENDA_FULL_HTML_URL != "" ]]; then
echo "Saving full package agenda as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O "$1/Agenda_FullPackage.html" -N -q #--show-progress
fi
elif [[ $AGENDA_POST_PDF_URL != "" ]] || [[ $AGENDA_POST_HTML_URL != "" ]]; then
if [[ $AGENDA_POST_PDF_URL != "" ]]; then
echo "Saving post agenda as HTML... (no HTML found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_PDF_URL" -O "$1/Agenda_Post.pdf" -N -q #--show-progress
fi
if [[ $AGENDA_POST_HTML_URL != "" ]]; then
echo "Saving post agenda as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O "$1/Agenda_Post.html" -N -q #--show-progress
fi
fi
if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then
if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then
echo "Saving minutes with attachments as PDF..."
wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$1/Minutes_With_Attachments.pdf" -N -q #--show-progress
fi
if [[ $MINUTES_PDF_URL != "" ]]; then
echo "Saving minutes as PDF..."
wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$1/Minutes.pdf" -N -q #--show-progress
fi
else
if [[ $MINUTES_HTML_URL != "" ]]; then
echo "Saving minutes as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$1/Minutes.html" -N -q #--show-progress
fi
fi
if [[ $AGENDA_COVER_PDF_URL != "" ]]; then
echo "Saving cover agenda as PDF... (no HTML found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_PDF_URL" -O "$1/Agenda_Cover.pdf" -N -q #--show-progress
fi
if [[ $AGENDA_COVER_HTML_URL != "" ]]; then
echo "Saving cover agenda as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O "$1/Agenda_Cover.html" -N -q #--show-progress
fi
if [[ $ADDENDUM_PDF_URL != "" ]]; then
echo "Saving addendum as PDF... (no HTML found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_PDF_URL" -O "$1/Addendum.pdf" -N -q #--show-progress
fi
if [[ $ADDENDUM_HTML_URL != "" ]]; then
echo "Saving addendum as HTML... (no PDF found!)"
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O "$1/Addendum.html" -N -q #--show-progress
fi
}
# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can just pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
INDEX_PAGE="./tmp/index.html"
SEARCH_PAGE="./tmp/search.html"
AGENDA_HTML="./tmp/work.html"
ADDENDUM_HTML="./tmp/addendum.html"
#VIDEO_TIMESTAMP_JSON="./tmp/time.json"
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
SUPPORT_PAST="FALSE"
if [ -d "$TEMP_DIR" ]; then
rm -r $TEMP_DIR
fi
rm -f $INDEX_PAGE
rm -f $SEARCH_PAGE
rm -f $AGENDA_HTML
mkdir $TEMP_DIR
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
INDEX_END="FALSE"
while [[ $INDEX_END == "FALSE" ]]; do
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
if [ $? -ne 8 ]; then
FOUNDLIST="FALSE"
while IFS= read -r LINE; do
if [[ "TRUE" == $FOUNDLIST ]]; then
GREPENDLIST=$(echo $LINE | grep '<option ')
if [[ "$GREPENDLIST" == "" ]]; then
echo "SCRAPE_ESCRIBE: End of list."
INDEX_END="TRUE"
break
else
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
echo "-========================================================================-"
echo "- $MEETING_NAME"
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
echo "- Corrected to: $MEETING_NAME"
fi
# Pages start at 1. Ew.
x=1
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
#cat "${TEMP_DIR}escribe.json" > debug.json
y=0
i=0
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
while (true); do
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
if [[ "$NUM_IN_JSON" == "" ]]; then
break
fi
# Decrease in the meeting count == we're on the final page.
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then
((x++))
i=0
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
elif (( $i >= 10#$NUM_IN_JSON )); then
break
fi
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
# No need to cat the entire file every time.
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
#echo "> Meeting ID"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
#echo "> Meeting Attachments"
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
clear_agenda_url
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
done
# "25 Feb 2026"
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
echo "Alternate date format."
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
# "Feb 25 2026"
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
echo "Standard date format."
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
else
echo "COULD NOT FIGURE OUT DATE FORMAT!"
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
fi
INPAST=""
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
echo "NAME : $MEETING_NAME"
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
echo "A (H) : $AGENDA_HTML_URL"
echo "A (P) : $AGENDA_PDF_URL"
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
echo "AF(H) : $AGENDA_FULL_HTML_URL"
echo "AF(P) : $AGENDA_FULL_PDF_URL"
echo "AC(H) : $AGENDA_COVER_HTML_URL"
echo "AC(P) : $AGENDA_COVER_PDF_URL"
echo "AP(H) : $AGENDA_POST_HTML_URL"
echo "AP(P) : $AGENDA_POST_PDF_URL"
echo "M (H) : $MINUTES_HTML_URL"
echo "M (P) : $MINUTES_PDF_URL"
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
echo "AD(H) : $ADDENDUM_HTML_URL"
echo "AD(P) : $ADDENDUM_PDF_URL"
else
echo "Dates are in the past!"
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
INPAST="TRUE"
fi
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
echo "Abort."
break
fi
#echo "> Meeting Video"
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
VIDEO_URL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
ERROR="FALSE"
ADDENDUM_ERROR="FALSE"
echo "Downloading agenda HTML..."
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_FULL_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_POST_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O $AGENDA_HTML -q #--show-progress
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
else
ERROR="TRUE"
fi
if [[ $ADDENDUM_HTML_URL != "" ]]; then
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
else
ADDENDUM_ERROR="TRUE"
fi
if [[ "$ERROR" == "FALSE" ]]; then
mkdir "./$CITY_ARCHIVE_NAME"
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
fi
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/"
fi
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
if [ ! -d "$MEETING_DIR" ]; then
mkdir "$MEETING_DIR/"
fi
if [ ! -d "$MEETING_DIR/Attachments" ]; then
mkdir "$MEETING_DIR/Attachments/"
fi
if [[ $VIDEO_URL != "" ]]; then
echo "Saving recording URL..."
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
fi
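# The agenda HTML lists each attachment as a filestream.ashx link whose data-original-title attribute holds the file name.
# The two pipelines below pull those out into de-duplicated, parallel lists of URLs and names (one per line),
# which are read back in lock-step by the download loop further down.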
# Get attachment links
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
# Get attachment names
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
# Get attachment links
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
# Get attachment names
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
fi
# Download attachment and use the name grabbed above
echo "Found the following agenda attachments:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
echo "- $LINEA2"
wget --no-check-certificate --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -N -q #--show-progress
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
echo "All attachments saved."
download_agendas "$MEETING_DIR"
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
echo "dir not empty" >> /dev/null
else
rm -r "$MEETING_DIR/Attachments"
fi
echo "All files from this meeting have been saved."
fi
((i++))
((y++))
done
fi
fi
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
if [[ "$GREPLIST" != "" ]]; then
echo "SCRAPE_ESCRIBE: Found meeting type list."
FOUNDLIST="TRUE"
fi
done < $INDEX_PAGE
else
INDEX_END="TRUE"
echo "SCRAPE_ESCRIBE: Couldn't save index!"
fi
done
done < websites.csv

98
SCRAPE_OPEN.SH Normal file

@ -0,0 +1,98 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_OPENDATA.SH: Scrape Open Data from the City of London -=-"
echo -e "-=- -=-"
echo -e "-=- https://gist.github.com/rvtr/******************************** -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
WORKDIR="./tmp"
STAGEDIR="./staging"
DOCDIR="./LondonArchive_OpenData"
MAPDIR="./LondonArchive_OpenData/Maps"
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
mkdir $WORKDIR
mkdir $DOCDIR
mkdir $MAPDIR
i=0
SEARCH_END=0
while [[ $SEARCH_END == 0 ]]; do
if ((i == 0)); then
OFFSET=""
else
OFFSET="startindex=$((i * 100))"
fi
echo "Start index download..."
curl --get \
--data-urlencode "filter=((group IN (de724381536540a5bf2d005fb32ec92a, d17e3e9bfd274e88aeed15fa165bf1e3, b7ab05d332c24dd2ba485acd2ac92837, b15cf62bc0a14990a75e348930b0cb4e)))" \
--data-urlencode "limit=100" \
--data-urlencode "$OFFSET" \
"https://hub.arcgis.com/api/search/v1/collections/all/items" \
| jq > $WORKDIR/arcgis_list.json
TOTAL_ITEMS=$(jq .numberMatched $WORKDIR/arcgis_list.json)
RETURNED_ITEMS=$(jq .numberReturned $WORKDIR/arcgis_list.json)
echo "Total items in JSON : $TOTAL_ITEMS"
echo "Returned items : $RETURNED_ITEMS"
for (( j=0; j<=$((RETURNED_ITEMS - 1)); j++ )); do
ITEM_ID=$(jq ".features[$j].id" $WORKDIR/arcgis_list.json | sed 's/\"//g')
ITEM_TITLE=$(jq ".features[$j].properties.title" $WORKDIR/arcgis_list.json | sed 's/\"//g')
ITEM_URL=$(jq ".features[$j].properties.url" $WORKDIR/arcgis_list.json | sed 's/\"//g')
ITEM_NAME=$(jq ".features[$j].properties.name" $WORKDIR/arcgis_list.json | sed 's/\"//g')
echo "Cur. article: $i.$j, ID : $ITEM_ID"
echo " Cur. article: $i.$j, Title: $ITEM_TITLE"
echo " Cur. article: $i.$j, URL : $ITEM_URL"
echo " Cur. article: $i.$j, Name : $ITEM_NAME"
rm -rf $STAGEDIR
mkdir $STAGEDIR
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$STAGEDIR/$ITEM_NAME" -c -q
echo " Downloaded."
echo "Compressing."
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
fi
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]]; then
MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')"
echo " ^^^ Item is map. ($MAP_ID) "
# https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
# KML and GeoJSON use the spatial ID of 4326, all others use 26917
MAP_CSV="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=csv&spatialRefId=26917&where=1=1"
MAP_SHP="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=shp&spatialRefId=26917&where=1=1"
MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"
echo " Map URL (CSV) : $MAP_CSV"
wget --user-agent="$WGET_UA" "$MAP_CSV" -O "$STAGEDIR/$ITEM_TITLE.csv" -c -q
echo " Downloaded."
echo " Map URL (Shapefile): $MAP_SHP"
wget --user-agent="$WGET_UA" "$MAP_SHP" -O "$STAGEDIR/$ITEM_TITLE.shp" -c -q
echo " Downloaded."
echo " Map URL (GeoJSON) : $MAP_GEO"
wget --user-agent="$WGET_UA" "$MAP_GEO" -O "$STAGEDIR/$ITEM_TITLE.geojson" -c -q
echo " Downloaded."
echo " Map URL (KML) : $MAP_KML"
wget --user-agent="$WGET_UA" "$MAP_KML" -O "$STAGEDIR/$ITEM_TITLE.kml" -c -q
echo " Downloaded."
echo " Source URL is $ITEM_URL."
echo "Compressing."
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$MAPDIR/$ITEM_TITLE.7z" "$STAGEDIR"
fi
done
if (( ($((i * 100)) + j) >= TOTAL_ITEMS)); then
echo "No more items!"
SEARCH_END=1
break
fi
((i++))
done

351
SCRAPE_PLAN.SH Normal file

@ -0,0 +1,351 @@
#!/usr/bin/env bash
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_PLANAPPS.SH: Downloads planning applications -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"
conv_date_plan() {
PROJECT_TIME_YEAR=$(echo $1 | sed 's/.*\([0-9]\{4\}\).*/\1/p' | uniq)
PROJECT_TIME_MONTH_WORD=$(echo $1 | sed 's/.*,\s*\([A-Za-z]*\)\s[0-9]\{1,2\},.*/\1/p' | uniq)
PROJECT_TIME_DAY_SHORT=$(echo $1 | sed 's/.*,\s*[A-Za-z]*\s\([0-9]\{1,2\}\),.*/\1/p' | uniq)
PROJECT_TIME_DAY=$(printf "%02d" $PROJECT_TIME_DAY_SHORT)
case "$PROJECT_TIME_MONTH_WORD" in
Jan*) PROJECT_TIME_MONTH="01" ;;
Feb*) PROJECT_TIME_MONTH="02" ;;
Mar*) PROJECT_TIME_MONTH="03" ;;
Apr*) PROJECT_TIME_MONTH="04" ;;
May) PROJECT_TIME_MONTH="05" ;;
Jun*) PROJECT_TIME_MONTH="06" ;;
Jul*) PROJECT_TIME_MONTH="07" ;;
Aug*) PROJECT_TIME_MONTH="08" ;;
Sep*) PROJECT_TIME_MONTH="09" ;;
Oct*) PROJECT_TIME_MONTH="10" ;;
Nov*) PROJECT_TIME_MONTH="11" ;;
Dec*) PROJECT_TIME_MONTH="12" ;;
*) PROJECT_TIME_MONTH="--" ;;
esac
}
conv_date() {
MODIFIED_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MODIFIED_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MODIFIED_DAY=$(printf "%02d" $MODIFIED_DAY_SHORT)
MODIFIED_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MODIFIED_MONTH_WORD" in
Jan*) MODIFIED_MONTH="01" ;;
Feb*) MODIFIED_MONTH="02" ;;
Mar*) MODIFIED_MONTH="03" ;;
Apr*) MODIFIED_MONTH="04" ;;
May) MODIFIED_MONTH="05" ;;
Jun*) MODIFIED_MONTH="06" ;;
Jul*) MODIFIED_MONTH="07" ;;
Aug*) MODIFIED_MONTH="08" ;;
Sep*) MODIFIED_MONTH="09" ;;
Oct*) MODIFIED_MONTH="10" ;;
Nov*) MODIFIED_MONTH="11" ;;
Dec*) MODIFIED_MONTH="12" ;;
*) MODIFIED_MONTH="--" ;;
esac
}
# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully we can just pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_pa.html"
PROJECT_PAGE="./tmp/work_pa.html"
PROJECT_INFO="./tmp/info.txt"
PROJECT_ATTACH_NAMES="./tmp/names.txt"
PROJECT_ATTACH_URLS="./tmp/urls.txt"
PROJECT_IMAGE_NAMES="./tmp/image-names.txt"
PROJECT_IMAGE_URLS="./tmp/image-urls.txt"
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)
#if [ -d "$TEMP_DIR" ]; then
# rm -r $TEMP_DIR
#fi
rm -f $SEARCH_PAGE
rm -f $PROJECT_PAGE
mkdir $TEMP_DIR
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
j=0
SEARCH_END="FALSE"
while [[ $SEARCH_END == "FALSE" ]]; do
echo "-========================================================================-"
echo "Downloading search results... Page $j"
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
if [ $? -ne 8 ]; then
PAGE_HAS_APPS=$(cat $SEARCH_PAGE | grep "teaser__title")
if [[ "$PAGE_HAS_APPS" != "" ]]; then
while IFS= read -r LINE; do
rm -f $PROJECT_INFO
PAGE_FOUND_APP=$(echo $LINE | grep "teaser__title")
if [[ "$PAGE_FOUND_APP" != "" ]]; then
echo "-========================================================================-"
echo "Task starting on: $(date)"
PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq)
PROJECT_URL=$(echo "https://london.ca"$PROJECT_URL)
echo "Downloading page..."
wget --user-agent="$WGET_UA" $PROJECT_URL -O $PROJECT_PAGE --timestamping -q #--show-progress
# Stripping a leading "COVID-19" is due to page naming in the early 2020s. Keeping it in for revisiting Wayback crawls.
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
echo " Found project: $PROJECT_NAME"
MODIFIED_MONTH=""
MODIFIED_YEAR=""
conv_date "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
if (( 10#$MODIFIED_YEAR >= 10#$current_year )) && (( 10#$MODIFIED_MONTH >= $((10#$current_month - 1)) )); then
echo "Last Modified: $MODIFIED_YEAR/$MODIFIED_MONTH/$MODIFIED_DAY"
else
echo "Dates are in the past! Abort."
break
fi
echo "Finding attachments..."
rm -f $PROJECT_ATTACH_URLS
rm -f $PROJECT_ATTACH_NAMES
rm -f $PROJECT_IMAGE_URLS
rm -f $PROJECT_IMAGE_NAMES
while IFS= read -r PLINE; do
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
NEXT_LINE_FITEM="FALSE"
# Is this line an actual item?
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
# Is this line bad data (usually scripts)?
PROJECT_INFO_IS_BAD=$(echo $PLINE | grep "</script>")
# Gotta add in the &s and 's.
PROJECT_INFO_ITEM=$(echo $PLINE | sed 's/.*<div class="field__item">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | uniq)
if [[ $PROJECT_INFO_IS_ITEMS == "" ]] && [[ $PROJECT_INFO_IS_BAD == "" ]] && [[ $PROJECT_INFO_ITEM != "" ]]; then
# We'll check to see if a non-info item made it in. Sometimes attachments will get caught, but can be detected by "visually-hidden"
PROJECT_INFO_LABEL_BAD=$(echo $PROJECT_INFO_LABEL | grep "visually-hidden")
if [[ $PROJECT_INFO_LABEL_BAD == "" ]]; then
printf "%-17s: %s\n" "$PROJECT_INFO_LABEL" "$PROJECT_INFO_ITEM" >> $PROJECT_INFO
if [[ $PROJECT_INFO_LABEL == "File Number" ]]; then
PROJECT_FILE_NUM_2=""
PROJECT_FILE_NUM_TYPE_2=""
PROJECT_FILE_NUM_IS_MULTI=""
# Multiple file numbers may be listed. We will always use the first one as it is contained in PDF names.
# I think it takes priority. Anyways, here are the possible formats:
# XX-#####
# XX-#####/XX-#####
# XX-##### / XX-#####
# XX-##### and XX-#####
# XX-##### & XX-#####
#
# I think the city is allergic to standardization...
PROJECT_FILE_NUM_IS_MULTI=$(echo $PROJECT_INFO_ITEM | grep -e "and" -e "/" -e "&")
PROJECT_FILE_NUM=$(echo $PROJECT_INFO_ITEM | sed 's|/.*||' | sed 's| and .*||' | sed 's| & .*||' | sed 's/^[[:space:]]*//g' | sed 's/[[:space:]]*$//g' | uniq)
PROJECT_FILE_NUM_TYPE=$(echo "$PROJECT_FILE_NUM" | sed 's/^\([^-]*\)-.*$/\1/')
if [[ "$PROJECT_FILE_NUM_TYPE" == "Line of Sight" ]]; then
PROJECT_FILE_TYPE="Line of Sight"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "O" ]]; then
PROJECT_FILE_TYPE="Official Plan Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "Z" ]]; then
PROJECT_FILE_TYPE="Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "OZ" ]]; then
PROJECT_FILE_TYPE="Official Plan and Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "TZ" ]]; then
PROJECT_FILE_TYPE="Temporary Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "39T" ]]; then
PROJECT_FILE_TYPE="Draft Plan of Subdivision"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "39CD" ]]; then
PROJECT_FILE_TYPE="Draft Plan of Condominium"
elif [[ "$PROJECT_FILE_NUM_TYPE" =~ ^SPA2[0-9]+$ ]]; then
PROJECT_FILE_TYPE="Site Plan Control Application"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "M" ]]; then
PROJECT_FILE_TYPE="Minor Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE" == "H" ]]; then
PROJECT_FILE_TYPE="Holding Provision By-law Amendment"
else
PROJECT_FILE_TYPE="BAD RECORD TYPE"
fi
echo "Found file# : $PROJECT_FILE_NUM ($PROJECT_FILE_TYPE)"
if [[ "$PROJECT_FILE_NUM_IS_MULTI" != "" ]]; then
# It isn't great, but if a project has 2 file numbers then we'll save it as both.
# I'm not sure how to get around this since I don't have a way to tag files.
PROJECT_FILE_NUM_2=$(echo $PROJECT_INFO_ITEM | sed 's|.*/||' | sed 's|.* and ||' | sed 's|.* & ||' | sed 's/^[[:space:]]*//g' | sed 's/[[:space:]]*$//g' | uniq)
PROJECT_FILE_NUM_TYPE_2=$(echo "$PROJECT_FILE_NUM_2" | sed 's/^\([^-]*\)-.*$/\1/')
if [[ "$PROJECT_FILE_NUM_TYPE_2" == "Line of Sight" ]]; then
PROJECT_FILE_TYPE_2="Line of Sight"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "O" ]]; then
PROJECT_FILE_TYPE_2="Official Plan Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "Z" ]]; then
PROJECT_FILE_TYPE_2="Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "OZ" ]]; then
PROJECT_FILE_TYPE_2="Official Plan and Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "TZ" ]]; then
PROJECT_FILE_TYPE_2="Temporary Zoning By-law Amendment"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "39T" ]]; then
PROJECT_FILE_TYPE_2="Draft Plan of Subdivision"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "39CD" ]]; then
PROJECT_FILE_TYPE_2="Draft Plan of Condominium"
elif [[ "$PROJECT_FILE_NUM_TYPE_2" =~ ^SPA2[0-9]+$ ]]; then
PROJECT_FILE_TYPE_2="Site Plan Control Application"
else
PROJECT_FILE_TYPE_2="BAD RECORD TYPE"
fi
echo "Also filed as: $PROJECT_FILE_NUM_2 ($PROJECT_FILE_TYPE_2)"
fi
fi
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
if [[ $PROJECT_FOUND_TIME != "" ]]; then
conv_date_plan "$PLINE"
echo "Found date : $PROJECT_TIME_YEAR/$PROJECT_TIME_MONTH/$PROJECT_TIME_DAY"
fi
fi
fi
fi
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
NEXT_LINE_IMAGE="FALSE"
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
PROJECT_IMAGE_URL=$(echo "https://london.ca"$PROJECT_IMAGE_URL)
fi
PROJECT_IMAGE_NAME=$(echo $PROJECT_IMAGE_URL | sed 's#.*/##p' | uniq)
echo $PROJECT_IMAGE_URL >> $PROJECT_IMAGE_URLS
echo $PROJECT_IMAGE_NAME >> $PROJECT_IMAGE_NAMES
fi
PROJECT_FOUND_FILE=$(echo $PLINE | grep "file--mime-application-")
if [[ $PROJECT_FOUND_FILE != "" ]]; then
PROJECT_ATTACH_URL=$(echo $PLINE | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq)
# Newer links are relative paths, so we must add back the domain
PROJECT_ATTACH_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
if [[ $PROJECT_ATTACH_URL_SHORT == "" ]];then
PROJECT_ATTACH_URL=$(echo "https://london.ca"$PROJECT_ATTACH_URL)
fi
PROJECT_ATTACH_NAME=$(echo $PLINE | sed 's/.*title="\([^"]*\)".*/\1/p' | sed 's/&amp;/\&/g' | sed 's/&#039;/'\''/g' | uniq)
echo $PROJECT_ATTACH_URL >> $PROJECT_ATTACH_URLS
echo $PROJECT_ATTACH_NAME >> $PROJECT_ATTACH_NAMES
fi
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
NEXT_LINE_FITEM="TRUE"
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
# We're setting a flag to let the script know if an upcoming line is contents.
fi
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
NEXT_LINE_IMAGE="TRUE"
# Same idea as before but for the image shown on the main page.
fi
PROJECT_FOUND_EMAIL=$(echo $PLINE | grep "field--name-field-email" | sed 's/.*href="\([^"]*\)".*/\1/p' | sed 's|^mailto:||' | uniq)
if [[ "$PROJECT_FOUND_EMAIL" != "" ]]; then
printf "%-17s: %s\n" "Email" "$PROJECT_FOUND_EMAIL" >> $PROJECT_INFO
fi
PROJECT_FOUND_PLANNER=$(echo $PLINE | grep "field--name-name" | sed 's/.*<div[^>]*>\([^<]*\)<[\/:-]div>.*/\1/p' | uniq)
if [[ "$PROJECT_FOUND_PLANNER" != "" ]]; then
printf "\n%-17s: %s\n" "Planner" "$PROJECT_FOUND_PLANNER" >> $PROJECT_INFO
fi
done < $PROJECT_PAGE
echo "Filing away all the datas..."
mkdir "./LondonArchive"
mkdir "./LondonArchive/Planning Applications"
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE/"
fi
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE == $PROJECT_FILE_TYPE_2 ]]; then
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME/"
fi
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME/Attachments" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME/Attachments"
fi
PROJECT_SAVE_PATH="./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME"
else
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME/"
fi
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME/Attachments" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME/Attachments"
fi
PROJECT_SAVE_PATH="./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME"
fi
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/"
fi
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME/"
fi
if [ ! -d "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME/Attachments" ]; then
mkdir "./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME/Attachments"
fi
PROJECT_SAVE_PATH_2="./LondonArchive/Planning Applications/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME"
fi
echo "Saving attachments:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
#echo " - $LINEA1"
echo " - $LINEA2"
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH/Attachments/$LINEA2" --timestamping -q #--show-progress
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH_2/Attachments/$LINEA2" --timestamping -q #--show-progress
fi
done < $PROJECT_ATTACH_URLS 3< $PROJECT_ATTACH_NAMES
echo "All attachments saved."
if [[ "$PROJECT_IMAGE_URL" != "" ]]; then
PROJECT_IMAGE_URL=""
echo "Saving photos:"
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
#echo " - $LINEA1"
echo " - $LINEA2"
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH/$LINEA2" --timestamping -q #--show-progress
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH_2/$LINEA2" --timestamping -q #--show-progress
fi
done < $PROJECT_IMAGE_URLS 3< $PROJECT_IMAGE_NAMES
echo "All photos saved."
fi
echo "Extracted info summary:"
cat $PROJECT_INFO > "$PROJECT_SAVE_PATH/Info.txt"
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
cat $PROJECT_INFO > "$PROJECT_SAVE_PATH_2/Info.txt"
fi
cat $PROJECT_INFO
fi
done < $SEARCH_PAGE
else
SEARCH_END="TRUE"
echo "No more pages!"
fi
else
SEARCH_END="TRUE"
echo "No more pages!"
fi
((j++))
done

34
websites.csv Normal file

@ -0,0 +1,34 @@
"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", ""
"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", ""
"https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""