Upload files to "/"
This commit is contained in:
parent
0793fe0036
commit
58c637e051
@ -17,8 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
|
||||
|
||||
## Scrape eScribe meetings (SCRAPE_MEET.SH)
|
||||
|
||||
This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST="TRUE"`, meetings older than 2 months will be downloaded. Otherwise, they will be skipped.
|
||||
Don't ask why "TRUE" is a string and not a boolean...
|
||||
This Bash script scrapes meetings from the eScribe meetings platform. It defines a variable called `SUPPORT_PAST`: if `SUPPORT_PAST=1` (true), meetings older than 2 months are downloaded; otherwise, they are skipped.
|
||||
|
||||
The basic structure of the output files is:
|
||||
```
|
||||
@ -103,4 +102,4 @@ The basic structure of the output files is:
|
||||
|- <attachment 1>.pdf
|
||||
|- <attachment 2>.pdf
|
||||
\- etc etc
|
||||
```
|
||||
```
|
||||
|
||||
@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
|
||||
INDEX_END="FALSE"
|
||||
while [[ $INDEX_END == "FALSE" ]]; do
|
||||
INDEX_END=0
|
||||
while (( ! INDEX_END )); do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST="FALSE"
|
||||
FOUNDLIST=0
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||
if (( FOUNDLIST )); then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END="TRUE"
|
||||
INDEX_END=1
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
@ -88,11 +88,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST="TRUE"
|
||||
FOUNDLIST=1
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END="TRUE"
|
||||
INDEX_END=1
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
|
||||
@ -112,11 +112,11 @@ while IFS= read -r LINE_PRE; do
|
||||
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
|
||||
if [[ "$GREPARTICLESTART" != "" ]]; then
|
||||
echo " FOUND INDEX ARTICLE START"
|
||||
ISARTICLE="TRUE"
|
||||
ISARTICLE=1
|
||||
elif [[ "$GREPARTICLEEND" != "" ]]; then
|
||||
echo " END OF INDEX ARTICLE"
|
||||
ISARTICLE=""
|
||||
elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then
|
||||
ISARTICLE=0
|
||||
elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
|
||||
ISPDF=$(echo $GREPLINK | grep "\.pdf")
|
||||
if [[ "$ISPDF" != "" ]]; then
|
||||
@ -141,14 +141,14 @@ while IFS= read -r LINE_PRE; do
|
||||
# CSS for the HTML is in the default template
|
||||
cat ./template/default.html > ./tmp/new.html
|
||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||
ISATTACHMENTARTICLE="TRUE"
|
||||
ISATTACHMENTARTICLE=1
|
||||
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
|
||||
echo " END OF ATTACHMENT ARTICLE"
|
||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||
echo " PROCESSED TO PDF"
|
||||
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
||||
ISATTACHMENTARTICLE=""
|
||||
elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then
|
||||
ISATTACHMENTARTICLE=0
|
||||
elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
|
||||
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
|
||||
if [[ "$ISREFPDF" != "" ]]; then
|
||||
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
|
||||
@ -157,7 +157,7 @@ while IFS= read -r LINE_PRE; do
|
||||
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
|
||||
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
|
||||
fi
|
||||
elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then
|
||||
elif (( ISATTACHMENTARTICLE )); then
|
||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||
fi
|
||||
LINE_ATTACH=""
|
||||
|
||||
@ -198,7 +198,7 @@ current_year=$(date +%Y)
|
||||
current_month=$(date +%m)
|
||||
current_day=$(date +%d)00
|
||||
|
||||
SUPPORT_PAST="TRUE"
|
||||
SUPPORT_PAST=1
|
||||
|
||||
if [ -d "$TEMP_DIR" ]; then
|
||||
rm -r $TEMP_DIR
|
||||
@ -214,18 +214,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
|
||||
INDEX_END="FALSE"
|
||||
while [[ $INDEX_END == "FALSE" ]]; do
|
||||
INDEX_END=0
|
||||
while (( ! INDEX_END )); do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST="FALSE"
|
||||
FOUNDLIST=0
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||
if (( FOUNDLIST )); then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END="TRUE"
|
||||
INDEX_END=1
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
@ -312,11 +312,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
else
|
||||
echo "Dates are in the past!"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
INPAST="TRUE"
|
||||
INPAST=1
|
||||
fi
|
||||
|
||||
# NOTE: "break" seemed to misbehave once these loops were nested (a plain "break" only exits the innermost loop). TODO: revisit and confirm this check behaves as intended.
|
||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
||||
if (( INPAST )) && (( ! SUPPORT_PAST )); then
|
||||
echo "Abort."
|
||||
break
|
||||
fi
|
||||
@ -325,8 +325,8 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||
|
||||
ERROR="FALSE"
|
||||
ADDENDUM_ERROR="FALSE"
|
||||
ERROR=0
|
||||
ADDENDUM_ERROR=0
|
||||
echo "Downloading agenda HTML..."
|
||||
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
@ -339,16 +339,16 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
else
|
||||
ERROR="TRUE"
|
||||
ERROR=1
|
||||
fi
|
||||
|
||||
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
|
||||
else
|
||||
ADDENDUM_ERROR="TRUE"
|
||||
ADDENDUM_ERROR=1
|
||||
fi
|
||||
|
||||
if [[ "$ERROR" == "FALSE" ]]; then
|
||||
if (( ! ERROR )); then
|
||||
|
||||
mkdir "./$CITY_ARCHIVE_NAME"
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||
@ -376,7 +376,7 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
||||
if (( ! ADDENDUM_ERROR )); then
|
||||
# Get attachment links
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
@ -410,11 +410,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST="TRUE"
|
||||
FOUNDLIST=1
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END="TRUE"
|
||||
INDEX_END=1
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
|
||||
@ -81,8 +81,8 @@ mkdir $TEMP_DIR
|
||||
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
|
||||
|
||||
j=0
|
||||
SEARCH_END="FALSE"
|
||||
while [[ $SEARCH_END == "FALSE" ]]; do
|
||||
SEARCH_END=0
|
||||
while (( ! SEARCH_END )); do
|
||||
echo "-========================================================================-"
|
||||
echo "Downloading search results... Page $j"
|
||||
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||
@ -124,8 +124,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
||||
rm -f $PROJECT_IMAGE_NAMES
|
||||
|
||||
while IFS= read -r PLINE; do
|
||||
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
|
||||
NEXT_LINE_FITEM="FALSE"
|
||||
if (( NEXT_LINE_FITEM )); then
|
||||
NEXT_LINE_FITEM=0
|
||||
|
||||
# Is this line an actual item?
|
||||
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
|
||||
@ -215,8 +215,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
|
||||
NEXT_LINE_IMAGE="FALSE"
|
||||
if (( NEXT_LINE_IMAGE )); then
|
||||
NEXT_LINE_IMAGE=0
|
||||
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
|
||||
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
||||
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
|
||||
@ -243,14 +243,14 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
||||
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
|
||||
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
|
||||
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
|
||||
NEXT_LINE_FITEM="TRUE"
|
||||
NEXT_LINE_FITEM=1
|
||||
# Info boxes always have a label on one line and the contents on the next (except for contact info).
|
||||
# We're setting a flag to let the script know if an upcoming line is contents.
|
||||
fi
|
||||
|
||||
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
|
||||
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
|
||||
NEXT_LINE_IMAGE="TRUE"
|
||||
NEXT_LINE_IMAGE=1
|
||||
# Same idea as before but for the image shown on the main page.
|
||||
fi
|
||||
|
||||
@ -340,11 +340,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
||||
fi
|
||||
done < $SEARCH_PAGE
|
||||
else
|
||||
SEARCH_END="TRUE"
|
||||
SEARCH_END=1
|
||||
echo "No more pages!"
|
||||
fi
|
||||
else
|
||||
SEARCH_END="TRUE"
|
||||
SEARCH_END=1
|
||||
echo "No more pages!"
|
||||
fi
|
||||
((j++))
|
||||
|
||||
35
websites.csv
35
websites.csv
@ -1,34 +1,3 @@
|
||||
"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", ""
|
||||
"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", ""
|
||||
"https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
|
||||
"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
|
||||
"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
|
||||
"https://pub-london.escribemeetings.com/", "LondonArchive", ""
|
||||
"https://pub-stthomas.escribemeetings.com/", "StThomasArchive", ""
|
||||
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
|
||||
"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
|
||||
"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
|
||||
"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
|
||||
"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
|
||||
"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
|
||||
"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
|
||||
"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
|
||||
"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
|
||||
"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
|
||||
"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
|
||||
"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
|
||||
"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
|
||||
"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
|
||||
"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
|
||||
"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
|
||||
"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
|
||||
"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
|
||||
"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
|
||||
"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
|
||||
"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
|
||||
"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
|
||||
"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
|
||||
"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
|
||||
"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
|
||||
"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
|
||||
"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
|
||||
"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
|
||||
"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user