Separate functions
This commit is contained in:
parent
3bce46e582
commit
16c4905b41
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
LondonArchive/
|
||||||
|
LondonScrapers_privdata/
|
||||||
|
tmp/
|
||||||
|
staging/
|
||||||
6
README.MD
Normal file → Executable file
6
README.MD
Normal file → Executable file
@ -17,7 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
|
|||||||
|
|
||||||
## Scrape eScribe meetings (SCRAPE_MEET.SH)
|
## Scrape eScribe meetings (SCRAPE_MEET.SH)
|
||||||
|
|
||||||
This bash script will scrape meetings from the eScribe meetings platform.
|
This bash script scrapes meetings from the eScribe meetings platform. The script defines a variable named `SUPPORT_PAST`: when `SUPPORT_PAST=1` (true), meetings older than 2 months are downloaded; otherwise, they are skipped.
|
||||||
|
|
||||||
The basic structure of the output files is:
|
The basic structure of the output files is:
|
||||||
```
|
```
|
||||||
@ -76,7 +76,7 @@ The basic structure of the output files is:
|
|||||||
|
|
||||||
This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/
|
This bash script will scrape LTC meetings from their wordpress site at: https://www.londontransit.ca/agendas-and-minutes/
|
||||||
|
|
||||||
Attachments are downloaded as the HTML versions, converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and the quality can be low.
|
Attachments are downloaded as the HTML versions and converted to PDF. The original documents (linked from the agenda PDFs) may not always be OCRed, and their quality can be low. The HTML-to-PDF conversion requires the template page included at `./template/default.html`.
|
||||||
|
|
||||||
The basic structure of the output files is:
|
The basic structure of the output files is:
|
||||||
```
|
```
|
||||||
@ -102,4 +102,4 @@ The basic structure of the output files is:
|
|||||||
|- <attachment 1>.pdf
|
|- <attachment 1>.pdf
|
||||||
|- <attachment 2>.pdf
|
|- <attachment 2>.pdf
|
||||||
\- etc etc
|
\- etc etc
|
||||||
```
|
```
|
||||||
|
|||||||
66
SCRAPE_AGIS.SH
Executable file
66
SCRAPE_AGIS.SH
Executable file
@ -0,0 +1,66 @@
|
|||||||
|
#!/usr/bin/env bash
#
# SCRAPE_AGIS.SH - archive the MapServer layers published under the folders of
# the ArcGIS REST endpoint in ARGIS_URL.
#
# For every folder in the service list, every MapServer service in that folder
# and every layer of that service, the layer is queried page by page and each
# page is saved as GeoJSON and KMZ in a staging directory. The staging directory
# is then packed into
#   LondonArchive/ArcGIS/<service>/Layer <id> - <name>.7z
#
# Uses wget, curl, jq and 7z, plus _utils_download_helper (presumably defined in
# the sourced ./functions/.functions - confirm).

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_AGIS.SH: Downloads ArcGIS maps -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

source ./functions/.functions

WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

ARGIS_URL="https://maps.london.ca/server/rest/services"

TMP="./tmp"
TMP_STAGING="./tmp/layers"
SERVICELIST_JSON="$TMP/servicelist.json"
FOLDER_JSON="$TMP/folder.json"
SERVICE_JSON="$TMP/service.json"
LAYERQUERY_JSON="$TMP/layer_query.json"

# Records requested per query page.
# NOTE(review): assumes the server allows at least this many records per
# request; if its limit is lower, pages would silently skip records - confirm.
MAX_REQUESTS=2000

# -p: do not complain when the directories survive from an earlier run.
mkdir -p "$TMP" "$TMP_STAGING"

wget "$ARGIS_URL?f=json" --user-agent="$WGET_UA" -O "$SERVICELIST_JSON" -q

jq -r '.folders[]?' "$SERVICELIST_JSON" | while read -r FOLDER; do
  wget "$ARGIS_URL/$FOLDER?f=json" --user-agent="$WGET_UA" -O "$FOLDER_JSON" -q
  echo "Looking in $FOLDER"

  # `[]?` keeps jq quiet when the folder document has no services array
  # (e.g. the request above failed and left an error document behind).
  jq -r '.services[]?
    | select(.type=="MapServer")
    | .name' "$FOLDER_JSON" | while read -r SERVICE; do
    echo "Found $SERVICE"
    echo "$ARGIS_URL/$SERVICE/MapServer"
    wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q

    mkdir -p "LondonArchive/ArcGIS/${SERVICE}"

    jq -r '.layers[]? | "\(.id)|\(.name)"' "$SERVICE_JSON" | while IFS='|' read -r LAYERID LAYERNAME; do
      # Start every layer with an empty staging directory. `:?` aborts instead
      # of running `rm -rf` on an empty path.
      rm -rf -- "${TMP_STAGING:?}"
      mkdir -p "$TMP_STAGING"

      # Slashes and backslashes cannot appear in a file name: turn them into
      # spaces and squeeze the resulting runs of spaces. Quoted so a layer name
      # containing glob characters is not expanded against the file system.
      LAYERNAME_CLEAN=$(printf '%s\n' "$LAYERNAME" | sed -E -e 's#[/\\]# #g' -e 's/ {2,}/ /g' -e 's/^ +| +$//g')

      # Same browser User-Agent as every other request in this script.
      curl -s -A "$WGET_UA" "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&returnCountOnly=true&f=json" -o "$TMP/count.json"
      ITEM_COUNT=$(jq -r '.count' "$TMP/count.json")
      # A layer that cannot be counted yields "null" or nothing; treat as 0.
      [[ "$ITEM_COUNT" =~ ^[0-9]+$ ]] || ITEM_COUNT=0

      i=0 # offset of the first record of the current page
      j=0 # page number, used in the staged file names

      # The first page is always requested (also for a zero/unknown count, as
      # before); further pages only while records remain. The former test
      # `i <= ITEM_COUNT` fetched one extra, empty page whenever the count was
      # an exact multiple of the page size.
      while (( j == 0 || i < ITEM_COUNT )); do
        echo "Downloading $LAYERID-${j} $LAYERNAME_CLEAN"
        echo "$i of $ITEM_COUNT"

        _utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=geojson" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.geojson"
        echo "Done GeoJSON!"
        _utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=kmz" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.kmz"
        echo "Done KMZ!"

        i=$(( i + MAX_REQUESTS ))
        j=$(( j + 1 ))
      done

      7z a "LondonArchive/ArcGIS/${SERVICE}/Layer ${LAYERID} - ${LAYERNAME_CLEAN}.7z" "$TMP_STAGING"
    done
  done
done
|
||||||
14
SCRAPE_ESCRIBE.SH
Normal file → Executable file
14
SCRAPE_ESCRIBE.SH
Normal file → Executable file
@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||||
|
|
||||||
INDEX_END="FALSE"
|
INDEX_END=0
|
||||||
while [[ $INDEX_END == "FALSE" ]]; do
|
while (( ! INDEX_END )); do
|
||||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||||
if [ $? -ne 8 ]; then
|
if [ $? -ne 8 ]; then
|
||||||
FOUNDLIST="FALSE"
|
FOUNDLIST=0
|
||||||
while IFS= read -r LINE; do
|
while IFS= read -r LINE; do
|
||||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
if (( FOUNDLIST )); then
|
||||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||||
if [[ "$GREPENDLIST" == "" ]]; then
|
if [[ "$GREPENDLIST" == "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: End of list."
|
echo "SCRAPE_ESCRIBE: End of list."
|
||||||
INDEX_END="TRUE"
|
INDEX_END=1
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||||
@ -88,11 +88,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||||
if [[ "$GREPLIST" != "" ]]; then
|
if [[ "$GREPLIST" != "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||||
FOUNDLIST="TRUE"
|
FOUNDLIST=1
|
||||||
fi
|
fi
|
||||||
done < $INDEX_PAGE
|
done < $INDEX_PAGE
|
||||||
else
|
else
|
||||||
INDEX_END="TRUE"
|
INDEX_END=1
|
||||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|||||||
329
SCRAPE_GINV.SH
Executable file
329
SCRAPE_GINV.SH
Executable file
@ -0,0 +1,329 @@
|
|||||||
|
#!/usr/bin/env bash
#
# SCRAPE_GINV.SH - scrape the project pages listed at SEARCH_URL (the City of
# London "Get Involved" site) into ./LondonArchive/GetInvolved/.
#
# This first section only prints the banner, loads the shared helpers and sets
# up the paths used by the rest of the script.

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
# The banner used to announce SCRAPE_MPaS.SH (Master Plans and Strategies),
# copied over from another scraper; it now names this script.
echo -e "-=- SCRAPE_GINV.SH: Scrape Get Involved London projects -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

source ./functions/.functions

# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently blocked unusual user agents, so a plain wget (or
# even ping) is refused. Thankfully we can pretend to be a real person!
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch files; all of them live in TEMP_DIR.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"      # downloaded project index
PROJECT_PAGE="./tmp/project_ginv.html"   # downloaded page of the current project
WORK_HTML="./tmp/tmp.html"               # HTML assembled for the gallery PDF
CUSTOM_HTML="./tmp/custom_ginv.html"     # HTML assembled for the main PDF
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_TIMELINE="./tmp/custom_timeline_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"           # raw dump of everything scraped for a project

# Today's date; used to decide which projects were updated recently.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# Drop the index left over from a previous run.
rm -f "$SEARCH_PAGE"

# -p: the directory normally exists already after the first run.
mkdir -p "$TEMP_DIR"

SEARCH_URL="https://getinvolved.london.ca/projects"
|
||||||
|
|
||||||
|
#######################################
# Extract the project metadata from the opening tag of a project card.
# Globals:
#   PROJECT_NAME     (written) data-project-name, cleaned for use as a
#                    directory name and passed through _utils_fix_dashes
#   PROJECT_CATS     (written) data-project-category
#   PROJECT_LOCATION (written) data-project-location
# Arguments:
#   $1 - the card's HTML (one line, or two joined lines when the tag was split)
# Notes:
#   HTML entities are decoded BEFORE the attributes are read. The previous
#   chain matched the literal characters instead of the entities: it removed
#   every double quote (so the `attr="..."` patterns below could never match)
#   and contained a sed expression that did not even parse.
#   When an attribute is absent the corresponding variable receives the whole
#   decoded line, as before.
#######################################
set_metadata() {
  local card name

  card=$(printf '%s\n' "$1" | sed \
    -e 's/&amp;/\&/g' \
    -e 's/&quot;//g' \
    -e "s/&#0*39;/'/g" \
    -e "s/&apos;/'/g" \
    -e 's/[][]//g')

  # The name becomes a directory name: collapse whitespace runs (also covers a
  # tag that was split across two lines) and drop quote marks and colons.
  name=$(printf '%s\n' "$card" | sed \
    -e 's/.*data-project-name="\([^"]*\)".*/\1/' \
    -e 's/[[:space:]]\{1,\}/ /g' \
    -e 's/‘//g' \
    -e 's/’//g' \
    -e 's/&[lr]squo;//g' \
    -e "s/'//g" \
    -e 's/://g')
  PROJECT_NAME=$(_utils_fix_dashes "$name")

  PROJECT_CATS=$(printf '%s\n' "$card" | sed 's/.*data-project-category="\([^"]*\)".*/\1/')
  PROJECT_LOCATION=$(printf '%s\n' "$card" | sed 's/.*data-project-location="\([^"]*\)".*/\1/')
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Main scrape.
#
# The project index is read line by line. For each project card the name,
# categories and location are taken from the card tag (set_metadata), the URL
# from the line after it, and the last-updated date from the <time> element.
# On the line after the date has been seen, a recently updated project is
# downloaded and its page is walked by a small state machine: a "start" marker
# switches one block flag on, the matching "end" marker switches it off, and
# every line in between is filtered into the HTML that becomes Main.pdf and
# into the raw dump file.
# ---------------------------------------------------------------------------

# Print $1 with runs of whitespace collapsed to single spaces and both ends
# trimmed. This is what the former unquoted `echo $VAR` did, minus its hazards:
# no glob expansion of scraped HTML and no echo option parsing.
_ginv_flatten() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  printf '%s\n' "${words[*]}"
}

# Append the flattened text $1 as one line to every file named after it.
_ginv_append() {
  local flat target
  flat=$(_ginv_flatten "$1")
  shift
  for target in "$@"; do
    printf '%s\n' "$flat" >> "$target"
  done
}

# Print the first-captured href="..." value of line $1 (the whole flattened
# line when it has no href).
_ginv_href() {
  _ginv_flatten "$1" | sed 's/.*href="\([^"]*\)".*/\1/'
}

# Filter (stdin -> stdout): turn an embedded YouTube <iframe> into a normal
# <a href=""> link. wkhtmltopdf obviously can't embed youtube videos.
_ginv_unembed_youtube() {
  sed -e 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' \
    -e 's/<iframe/<a/' \
    -e 's/<\/iframe>/YouTube Link<\/a><\/br>/'
}

# Succeed when content line $1 is one of the boring notices to be left out.
_ginv_is_boilerplate() {
  local notice
  for notice in \
    "</h1>" \
    "City of London Land Acknowledgement" \
    "Ongoing Site Specific Planning Applications" \
    "This site is owned and operated by the City of London using software licensed from Social Pinpoint" \
    "Social Pinpoint has been commissioned by City of London (Canada) to collect and display user content on their behalf" \
    "Notice of Collection of Personal Information" \
    'href="/register"' \
    'href="/login"' \
    "Users have the right to access, correct, or delete their personal information" \
    "This privacy policy may change from time to time" \
    "Notice of Collection" \
    "Ready to have your say?"; do
    # Deliberately not filtered (were commented out in the original list):
    #   "Share your feedback", "Subscribe for project updates"
    if [[ "$1" == *"$notice"* ]]; then
      return 0
    fi
  done
  return 1
}

wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
index_status=$?

# wget exit status 8 means the server answered with an error response.
if (( index_status != 8 )); then
  while IFS= read -r LINE; do

    # A project is processed once its date has been read, if it was updated in
    # the current or the previous calendar month. Comparing year*12+month keeps
    # this correct across a year boundary; the former separate year and month
    # tests rejected December updates when run in January. `10#` forces base 10
    # so zero-padded values such as "08" are not read as octal.
    if (( FOUND_DATE )) && [[ -z "$LAST_LINE" ]] &&
      (( 10#$ITEM_YEAR * 12 + 10#$ITEM_MONTH >= 10#$current_year * 12 + 10#$current_month - 1 )); then
      FOUND_DATE=0
      printf '%s\n' "$PROJECT_URL"
      printf '%s\n' "$PROJECT_NAME"
      wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

      # Now we can work on the actual project page.
      rm -f "$CUSTOM_HTML_LINKS" "$CUSTOM_HTML_PHOTOS" "$CUSTOM_HTML_FAQ" \
        "$CUSTOM_HTML_PROFILE" "$CUSTOM_HTML_TIMELINE" "$CUSTOM_HTML_KEYDATES" \
        "$CUSTOM_HTML_SLIDER" "$FULLDUMP"

      cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
      echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

      # Every project starts outside of all blocks; a block left open by the
      # previous page (missing end marker) must not leak into this one.
      NEXT_LINE_CONTENT=0
      FIRST_CONTENT=0
      IS_DOC_BLOCK=0
      IS_PHOTO_BLOCK=0
      IS_FAQ_BLOCK=0
      IS_PROFILE_BLOCK=0
      IS_TIMELINE_BLOCK=0
      IS_KEYDATES_BLOCK=0
      IS_SLIDER_BLOCK=0
      IS_SINGLE_IMAGE_BLOCK=0

      while IFS= read -r LINE_PROJ; do

        # --- body content --------------------------------------------------
        if (( NEXT_LINE_CONTENT )); then
          # Next hive-block marks end of current item
          if [[ "$LINE_PROJ" == *"hive-block"* ]] || [[ -z "$LINE_PROJ" ]]; then
            NEXT_LINE_CONTENT=0
            echo "End of current content."
          elif ! _ginv_is_boilerplate "$LINE_PROJ"; then
            # The section marker is written only once real content exists.
            if (( FIRST_CONTENT )); then
              echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
              FIRST_CONTENT=0
            fi
            _ginv_append "$(printf '%s\n' "$LINE_PROJ" | _ginv_unembed_youtube)" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- document library ----------------------------------------------
        if (( IS_DOC_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_DOC_BLOCK=0
            # Keep the link list only if it holds at least one download link.
            if ! grep -q "/download_file/" "$CUSTOM_HTML_LINKS" 2> /dev/null; then
              rm -f "$CUSTOM_HTML_LINKS"
            fi
            echo "End of current documents."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML_LINKS" "$FULLDUMP"
          fi
        fi

        # --- photo block: only the link targets are kept --------------------
        if (( IS_PHOTO_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- end foreach -->"* ]]; then
            IS_PHOTO_BLOCK=0
            if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2> /dev/null; then
              rm -f "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current photos."
          elif [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
            _ginv_append "$(_ginv_href "$LINE_PROJ")" "$CUSTOM_HTML_PHOTOS" "$FULLDUMP"
          fi
        fi

        # --- FAQ -----------------------------------------------------------
        if (( IS_FAQ_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_FAQ_BLOCK=0
            echo "End of current FAQ."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
            # I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
            if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
              # Question links become headings; written to the PDF source only.
              _ginv_append "$(_ginv_flatten "$LINE_PROJ" | sed -e 's/<a role/<h3 role/g' -e 's/<\/a>/<\/h3>/g')" "$CUSTOM_HTML"
            elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
              _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
            fi
          fi
        fi

        # --- profile / bio -------------------------------------------------
        if (( IS_PROFILE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<script>"* ]]; then
            IS_PROFILE_BLOCK=0
            echo "End of current profile."
          elif [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- timeline ------------------------------------------------------
        if (( IS_TIMELINE_BLOCK )); then
          if [[ "$LINE_PROJ" == *"btn-unfill btn-primary"* ]]; then
            IS_TIMELINE_BLOCK=0
            echo "End of current timeline."
          elif [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- key dates -----------------------------------------------------
        if (( IS_KEYDATES_BLOCK )); then
          if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
            IS_KEYDATES_BLOCK=0
            echo "End of current key dates."
          elif [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- slider: collected separately for the gallery PDF ---------------
        if (( IS_SLIDER_BLOCK )); then
          if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
            IS_SLIDER_BLOCK=0
            # This message used to say "key dates" (copied from the block above).
            echo "End of current slider."
          elif [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
          fi
        fi

        # --- single image: ends at the first empty line ---------------------
        if (( IS_SINGLE_IMAGE_BLOCK )); then
          if [[ -z "$LINE_PROJ" ]]; then
            IS_SINGLE_IMAGE_BLOCK=0
            if grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2> /dev/null; then
              cat "$CUSTOM_HTML_PHOTOS"
            else
              rm -f "$CUSTOM_HTML_PHOTOS"
            fi
            echo "End of current single image."
          elif [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
            _ginv_append "$LINE_PROJ" "$CUSTOM_HTML" "$FULLDUMP"
          fi
        fi

        # --- start markers -------------------------------------------------
        # Checked last, so the line that closes one block may open the next.
        if [[ "$LINE_PROJ" == *"hive-block hive-block-content ljs"* ]]; then
          NEXT_LINE_CONTENT=1
          FIRST_CONTENT=1
          # We'll write the LA comment inside of the content block.
          # There we can ensure that the comment is only written if content does exist.
          echo "Found content start."
        elif [[ "$LINE_PROJ" == *"docLibModal hive-block-document-library"* ]]; then
          IS_DOC_BLOCK=1
          echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
          echo "Found documents start."
        elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
          IS_PHOTO_BLOCK=1
          echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
          echo "Found photos start."
        elif [[ "$LINE_PROJ" == *"hive-modal faqModal hive-block-faq"* ]]; then
          IS_FAQ_BLOCK=1
          echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
          echo "Found FAQ start."
        elif [[ "$LINE_PROJ" == *"hive-block-bio hive-block"* ]]; then
          IS_PROFILE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
          echo "Found profile start."
        elif [[ "$LINE_PROJ" == *"hive-block-timeline hive-block"* ]]; then
          IS_TIMELINE_BLOCK=1
          echo "<!-- LondonArchive_GINV_Timeline -->" >> "$FULLDUMP"
          echo "Found timeline start."
        elif [[ "$LINE_PROJ" == *"hive-modal dateModal"* ]]; then
          IS_KEYDATES_BLOCK=1
          echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
          echo "Found key dates start."
        elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
          IS_SLIDER_BLOCK=1
          echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
          echo "Found slider start."
        elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
          IS_SINGLE_IMAGE_BLOCK=1
          echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
          echo "Found single image start."
        fi

      done < "$PROJECT_PAGE"

      #cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
      #cat "$CUSTOM_HTML_LINKS" >> "$CUSTOM_HTML"

      mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"

      # Download every document linked from the document library.
      if [[ -s "$CUSTOM_HTML_LINKS" ]]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"download_file"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            doc_url=$(_ginv_href "$LINE_DOC")
            # The real file name is only visible in the redirect target.
            # Header lines end in CR and their name may be capitalised; with
            # several redirects the last Location is the final target. The
            # leading 32-hex-digit prefix of the stored name is dropped.
            DOC_NAME=$(curl -s -L -I -A "$WGET_UA" "$doc_url" |
              tr -d '\r' |
              grep -i '^location:' |
              tail -n 1 |
              sed -e 's/^[^:]*: *//' -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
            # No redirect seen: fall back to the last segment of the link.
            if [[ -z "$DOC_NAME" ]]; then
              DOC_NAME=${doc_url##*/}
            fi
            printf '%s\n' "$DOC_NAME"
            _utils_download_helper "$doc_url" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_LINKS"
      fi

      # Download every collected photo.
      if [[ -s "$CUSTOM_HTML_PHOTOS" ]]; then
        while IFS= read -r LINE_DOC; do
          if [[ "$LINE_DOC" == *"amazonaws"* ]]; then
            mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
            DOC_NAME=$(_ginv_flatten "$LINE_DOC" | sed -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
            printf '%s\n' "$DOC_NAME"
            _utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
          fi
        done < "$CUSTOM_HTML_PHOTOS"
      fi

      # Render the slider, if any, as a separate gallery PDF.
      if [[ -s "$CUSTOM_HTML_SLIDER" ]]; then
        mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
        cat ./template/default_getinvolved.html > "$WORK_HTML"
        echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> "$WORK_HTML"
        cat "$CUSTOM_HTML_SLIDER" >> "$WORK_HTML"
        echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$WORK_HTML"
        wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
      fi

      echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> "$CUSTOM_HTML"
      wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"

      # The dump only exists when at least one block was found on the page.
      if [[ -e "$FULLDUMP" ]]; then
        cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
      fi
    fi

    # The line after a project card carries the link to the project page.
    if (( NEXT_LINE_URL )); then
      NEXT_LINE_URL=0
      PROJECT_URL=$(_ginv_href "$LINE")
    fi

    if [[ -n "$LAST_LINE" ]]; then
      # Second half of a card tag that was split over two lines.
      set_metadata "$LAST_LINE$LINE"
      LAST_LINE=""
      NEXT_LINE_URL=1
    elif [[ "$LINE" == *"h-entry project card"* ]] && [[ "$LINE" == *"data-project-name"* ]] && [[ "$LINE" != *"<%-"* ]]; then
      #echo $LINE
      if [[ "$LINE" != *"data-project-category"* ]]; then
        # Sometimes lines are split, so we'll combine the pieces over time.
        LAST_LINE=$LINE
        echo "Line is split!"
      else
        LAST_LINE=""
        set_metadata "$LINE"
        NEXT_LINE_URL=1
      fi
    elif [[ "$LINE" == *'time class="dt-updated"'* ]]; then
      PROJECT_DATE=$(_ginv_flatten "$LINE" | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g')
      printf '%s\n' "$PROJECT_DATE"
      # Presumably sets ITEM_YEAR / ITEM_MONTH / ITEM_DAY, which are read right
      # below and in the recency test above - defined outside this script.
      _time_parse_monddyyyy "$PROJECT_DATE"
      echo "$ITEM_YEAR$ITEM_MONTH$ITEM_DAY"
      FOUND_DATE=1
    fi

  done < "$SEARCH_PAGE"
fi
|
||||||
301
SCRAPE_GINV_OLD.SH
Executable file
301
SCRAPE_GINV_OLD.SH
Executable file
@ -0,0 +1,301 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
echo -e "\n-========================================================================-"
|
||||||
|
echo -e "-=- -=-"
|
||||||
|
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
|
||||||
|
echo -e "-=- -=-"
|
||||||
|
echo -e "-=- Lillian Skinner -=-"
|
||||||
|
echo -e "-=- -=-"
|
||||||
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
|
source ./functions/.functions
|
||||||
|
|
||||||
|
# Todo:
|
||||||
|
# - Save updates (see bradley-ave)
|
||||||
|
# - Order, title, and collapse each scraped modal
|
||||||
|
|
||||||
|
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
||||||
|
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||||
|
|
||||||
|
TEMP_DIR="./tmp/"
|
||||||
|
SEARCH_PAGE="./tmp/index_ginv.html"
|
||||||
|
PROJECT_PAGE="./tmp/project_ginv.html"
|
||||||
|
WORK_HTML="./tmp/tmp.html"
|
||||||
|
CUSTOM_HTML="./tmp/custom_ginv.html"
|
||||||
|
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
|
||||||
|
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
|
||||||
|
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
|
||||||
|
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
|
||||||
|
CUSTOM_HTML_UPDATE="./tmp/custom_update_ginv.html"
|
||||||
|
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
|
||||||
|
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
|
||||||
|
FULLDUMP="./tmp/.fulldump.txt"
|
||||||
|
|
||||||
|
rm -f $SEARCH_PAGE
|
||||||
|
|
||||||
|
mkdir $TEMP_DIR
|
||||||
|
|
||||||
|
SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"
|
||||||
|
|
||||||
|
wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||||
|
if [ $? -ne 8 ]; then
|
||||||
|
while IFS= read -r LINE; do
|
||||||
|
|
||||||
|
if [[ "$PROJECT_NAME" != "" ]]; then
|
||||||
|
FOUND_DATE=0
|
||||||
|
echo $PROJECT_URL
|
||||||
|
echo $PROJECT_NAME
|
||||||
|
wget --user-agent="$WGET_UA" "$PROJECT_URL" -O $PROJECT_PAGE --timestamping -q #--show-progress
|
||||||
|
# Now we can work on the actual project page.
|
||||||
|
rm -f $CUSTOM_HTML_LINKS
|
||||||
|
rm -f $CUSTOM_HTML_PHOTOS
|
||||||
|
rm -f $CUSTOM_HTML_FAQ
|
||||||
|
rm -f $CUSTOM_HTML_PROFILE
|
||||||
|
rm -f $CUSTOM_HTML_UPDATE
|
||||||
|
rm -f $CUSTOM_HTML_KEYDATES
|
||||||
|
rm -f $CUSTOM_HTML_SLIDER
|
||||||
|
rm -f $FULLDUMP
|
||||||
|
|
||||||
|
cat ./template/default_getinvolved.html > $CUSTOM_HTML
|
||||||
|
echo "<h1>$PROJECT_NAME</h1>" >> $CUSTOM_HTML
|
||||||
|
while IFS= read -r LINE_PROJ; do
|
||||||
|
|
||||||
|
if (( IS_DOC_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == "</ul>" ]]; then
|
||||||
|
IS_DOC_BLOCK=0
|
||||||
|
FOUND_LINK=$(cat "$CUSTOM_HTML_LINKS" | grep "/documents/")
|
||||||
|
if [[ "$FOUND_LINK" == "" ]]; then
|
||||||
|
rm -f $CUSTOM_HTML_LINKS
|
||||||
|
fi
|
||||||
|
echo "End of current documents."
|
||||||
|
elif [[ "$LINE_PROJ" == *"a data-url"* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML_LINKS
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
echo $LINE_PROJ
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_PHOTO_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == "" ]]; then
|
||||||
|
IS_PHOTO_BLOCK=0
|
||||||
|
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
|
||||||
|
if [[ "$FOUND_LINK" == "" ]]; then
|
||||||
|
rm -f $CUSTOM_HTML_PHOTOS
|
||||||
|
fi
|
||||||
|
echo "End of current photos."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||||
|
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $CUSTOM_HTML_PHOTOS
|
||||||
|
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_FAQ_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == *"div class='clearfix'></div"* ]]; then
|
||||||
|
IS_FAQ_BLOCK=0
|
||||||
|
echo "End of current FAQ."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
|
||||||
|
# I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
|
||||||
|
if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
|
||||||
|
echo $(echo $LINE_PROJ | sed 's/<a role/<h3 role/g' | sed 's/<\/a>/<\/h3>/g') >> $CUSTOM_HTML
|
||||||
|
elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_PROFILE_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == *"<!--[if IE]>"* ]]; then
|
||||||
|
IS_PROFILE_BLOCK=0
|
||||||
|
echo "End of current profile."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_UPDATE_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == *"<div class='clearfix'></div>"* ]]; then
|
||||||
|
IS_UPDATE_BLOCK=0
|
||||||
|
echo "End of current update."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_KEYDATES_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == "" ]]; then
|
||||||
|
IS_KEYDATES_BLOCK=0
|
||||||
|
echo "End of current key dates."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_SLIDER_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
|
||||||
|
IS_SLIDER_BLOCK=0
|
||||||
|
echo "End of current key dates."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML_SLIDER
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( IS_SINGLE_IMAGE_BLOCK )); then
|
||||||
|
if [[ "$LINE_PROJ" == "" ]]; then
|
||||||
|
IS_SINGLE_IMAGE_BLOCK=0
|
||||||
|
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
|
||||||
|
if [[ "$FOUND_LINK" == "" ]]; then
|
||||||
|
rm -f $CUSTOM_HTML_PHOTOS
|
||||||
|
else
|
||||||
|
cat "$CUSTOM_HTML_PHOTOS"
|
||||||
|
fi
|
||||||
|
echo "End of current single image."
|
||||||
|
else
|
||||||
|
if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
|
||||||
|
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||||
|
echo $LINE_PROJ >> $FULLDUMP
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$LINE_PROJ" == *'div class="full-description hide"'* ]]; then
|
||||||
|
FIRST_CONTENT=1
|
||||||
|
# We'll write the LA comment inside of the content block.
|
||||||
|
# There we can ensure that the comment is only written if content does exist.
|
||||||
|
echo "Found content start."
|
||||||
|
|
||||||
|
if (( FIRST_CONTENT )); then
|
||||||
|
echo "<!-- LondonArchive_GINV_Body -->" >> $FULLDUMP
|
||||||
|
FIRST_CONTENT=0
|
||||||
|
fi
|
||||||
|
echo $(echo " $LINE_PROJ" | sed 's/.*<div class="full-description hide">/<div>/' | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $CUSTOM_HTML
|
||||||
|
echo $(echo " $LINE_PROJ" | sed 's/.*<div class="full-description hide">/<div>/' | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $FULLDUMP
|
||||||
|
|
||||||
|
elif [[ "$LINE_PROJ" == *"widget-wrap widget_document_library"* ]]; then
|
||||||
|
IS_DOC_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_Documents -->" >> $FULLDUMP
|
||||||
|
echo "Found documents start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
|
||||||
|
IS_PHOTO_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_Photos -->" >> $FULLDUMP
|
||||||
|
echo "Found photos start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_recent_photos'"* ]]; then
|
||||||
|
IS_FAQ_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_FAQ -->" >> $FULLDUMP
|
||||||
|
echo "Found FAQ start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"widget-wrap widget_project_team"* ]]; then
|
||||||
|
IS_PROFILE_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_Bio -->" >> $FULLDUMP
|
||||||
|
echo "Found profile start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"<div class='fr-view'>"* ]]; then
|
||||||
|
IS_UPDATE_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_Update -->" >> $FULLDUMP
|
||||||
|
echo "<h1>Project Updates</h1>" >> $CUSTOM_HTML_UPDATE
|
||||||
|
echo "Found update start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_life_cycle'"* ]]; then
|
||||||
|
IS_KEYDATES_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_Date -->" >> $FULLDUMP
|
||||||
|
echo "Found key dates start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
|
||||||
|
IS_SLIDER_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_Slider -->" >> $FULLDUMP
|
||||||
|
echo "Found slider start."
|
||||||
|
elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
|
||||||
|
IS_SINGLE_IMAGE_BLOCK=1
|
||||||
|
echo "<!-- LondonArchive_GINV_SingleImage -->" >> $FULLDUMP
|
||||||
|
echo "Found single image start."
|
||||||
|
fi
|
||||||
|
done < $PROJECT_PAGE
|
||||||
|
|
||||||
|
#cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
|
||||||
|
cat "$CUSTOM_HTML_LINKS" # >> "$CUSTOM_HTML"
|
||||||
|
|
||||||
|
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"
|
||||||
|
|
||||||
|
if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
|
||||||
|
while IFS= read -r LINE_DOC; do
|
||||||
|
if [[ "$LINE_DOC" == *"/documents/"* ]]; then
|
||||||
|
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||||
|
DOC_NAME="$(echo $LINE_DOC | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/g' | sed 's/ (pdf)//' | sed 's/^ +| +$//g').pdf"
|
||||||
|
echo "-------- "$DOC_NAME
|
||||||
|
_utils_download_helper "$(echo $LINE_DOC | sed 's/.*href="\([^"]*\)".*/\1/')/download" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
|
||||||
|
fi
|
||||||
|
done < $CUSTOM_HTML_LINKS
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
|
||||||
|
while IFS= read -r LINE_DOC; do
|
||||||
|
if [[ "$LINE_DOC" == *"ehq-production"* ]]; then
|
||||||
|
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||||
|
DOC_NAME=$(echo $LINE_DOC | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//')
|
||||||
|
echo "======== "$DOC_NAME
|
||||||
|
_utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
|
||||||
|
fi
|
||||||
|
done < $CUSTOM_HTML_PHOTOS
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
|
||||||
|
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||||
|
cat ./template/default_getinvolved.html > $WORK_HTML
|
||||||
|
echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> $WORK_HTML
|
||||||
|
cat "$CUSTOM_HTML_SLIDER" >> $WORK_HTML
|
||||||
|
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $WORK_HTML
|
||||||
|
wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
|
||||||
|
fi
|
||||||
|
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $CUSTOM_HTML
|
||||||
|
wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
|
||||||
|
cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
|
||||||
|
|
||||||
|
IS_DOC_BLOCK=0
|
||||||
|
IS_PHOTO_BLOCK=0
|
||||||
|
IS_FAQ_BLOCK=0
|
||||||
|
IS_PROFILE_BLOCK=0
|
||||||
|
IS_UPDATE_BLOCK=0
|
||||||
|
IS_KEYDATES_BLOCK=0
|
||||||
|
IS_SLIDER_BLOCK=0
|
||||||
|
IS_SINGLE_IMAGE_BLOCK=0
|
||||||
|
|
||||||
|
PROJECT_NAME=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( NEXT_LINE_CONT_NAME )); then
|
||||||
|
PROJECT_NAME=$(_utils_fix_dashes "$(echo $LAST_LINE$LINE | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' | sed 's/‘//g' | sed 's/\// and /g' | sed 's/\\/ and /g' | sed 's/’//g' | sed 's/'\''//g' | sed 's/://g' | sed 's/®//g' | sed 's/"//g' | sed 's/&/and/g' | sed 's/amp;//g' | sed 's/^ +| +$//g')")
|
||||||
|
NEXT_LINE_CONT_NAME=0
|
||||||
|
echo $PROJECT_NAME
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$LINE" == *"project-tile__meta__name"* ]]; then
|
||||||
|
if [[ "$LINE" != *"</span"* ]]; then
|
||||||
|
NEXT_LINE_CONT_NAME=1
|
||||||
|
LAST_LINE=$LINE
|
||||||
|
else
|
||||||
|
PROJECT_NAME=$(_utils_fix_dashes "$(echo $LINE | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' | sed 's/‘//g' | sed 's/\// and /g' | sed 's/\\/ and /g' | sed 's/’//g' | sed 's/'\''//g' | sed 's/://g' | sed 's/®//g' | sed 's/"//g' | sed 's/&/and/g' | sed 's/'//g' | sed 's/amp;//g' | sed 's/^ +| +$//g')")
|
||||||
|
echo $PROJECT_NAME
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$LINE" == *"project-tile__link"* ]]; then
|
||||||
|
PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/')
|
||||||
|
PROJECT_URL=$(echo $SEARCH_URL$PROJECT_URL)
|
||||||
|
echo " "$PROJECT_URL
|
||||||
|
# Reset project name to mark the start of a new project
|
||||||
|
PROJECT_NAME=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
done < $SEARCH_PAGE
|
||||||
|
fi
|
||||||
39
SCRAPE_LPS.SH
Normal file → Executable file
39
SCRAPE_LPS.SH
Normal file → Executable file
@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env bash
|
#!/bin/bash
|
||||||
echo -e "\n-========================================================================-"
|
echo -e "\n-========================================================================-"
|
||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
|
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
|
||||||
@ -8,28 +8,7 @@ echo -e "-=- Lillian Skinner
|
|||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-========================================================================-"
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
conv_date() {
|
source ./functions/.functions
|
||||||
echo "$1"
|
|
||||||
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
|
|
||||||
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
case "$MEETING_MONTH_WORD" in
|
|
||||||
Jan*) MEETING_MONTH="01" ;;
|
|
||||||
Feb*) MEETING_MONTH="02" ;;
|
|
||||||
Mar*) MEETING_MONTH="03" ;;
|
|
||||||
Apr*) MEETING_MONTH="04" ;;
|
|
||||||
May) MEETING_MONTH="05" ;;
|
|
||||||
Jun*) MEETING_MONTH="06" ;;
|
|
||||||
Jul*) MEETING_MONTH="07" ;;
|
|
||||||
Aug*) MEETING_MONTH="08" ;;
|
|
||||||
Sep*) MEETING_MONTH="09" ;;
|
|
||||||
Oct*) MEETING_MONTH="10" ;;
|
|
||||||
Nov*) MEETING_MONTH="11" ;;
|
|
||||||
Dec*) MEETING_MONTH="12" ;;
|
|
||||||
*) MEETING_MONTH="--" ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
MEETINGS_PAGE="./tmp.html"
|
MEETINGS_PAGE="./tmp.html"
|
||||||
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
||||||
@ -45,9 +24,9 @@ current_year=$(date +%Y)
|
|||||||
current_month=$(date +%m)
|
current_month=$(date +%m)
|
||||||
current_day=$(date +%d)
|
current_day=$(date +%d)
|
||||||
# If I don't set these values then "10#: invalid integer constant"
|
# If I don't set these values then "10#: invalid integer constant"
|
||||||
MEETING_YEAR="0000"
|
ITEM_YEAR="0000"
|
||||||
MEETING_MONTH="00"
|
ITEM_MONTH="00"
|
||||||
MEETING_DAY="00"
|
ITEM_DAY="00"
|
||||||
|
|
||||||
while IFS= read -r LINE_PRE; do
|
while IFS= read -r LINE_PRE; do
|
||||||
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
|
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
|
||||||
@ -66,11 +45,11 @@ while IFS= read -r LINE_PRE; do
|
|||||||
|
|
||||||
FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">')
|
FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">')
|
||||||
if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then
|
if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then
|
||||||
conv_date "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
|
_time_parse_helper "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
|
||||||
echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
echo "$ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||||
echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')"
|
echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')"
|
||||||
mkdir -p "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/"
|
mkdir -p "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/"
|
||||||
wget "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" -O "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/$ATTACH_TYPE.pdf" -q
|
_utils_download_helper "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/$ATTACH_TYPE.pdf"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
done < "./tmp/index.html"
|
done < "./tmp/index.html"
|
||||||
|
|||||||
69
SCRAPE_LTC.SH
Normal file → Executable file
69
SCRAPE_LTC.SH
Normal file → Executable file
@ -7,6 +7,8 @@ echo -e "-=- Lillian Skinner
|
|||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-========================================================================-"
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
|
source ./functions/.functions
|
||||||
|
|
||||||
MEETINGS_PAGE="./tmp.html"
|
MEETINGS_PAGE="./tmp.html"
|
||||||
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
||||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||||
@ -21,9 +23,9 @@ current_year=$(date +%Y)
|
|||||||
current_month=$(date +%m)
|
current_month=$(date +%m)
|
||||||
current_day=$(date +%d)
|
current_day=$(date +%d)
|
||||||
# If I don't set these values then "10#: invalid integer constant"
|
# If I don't set these values then "10#: invalid integer constant"
|
||||||
MEETING_YEAR="0000"
|
ITEM_YEAR="0000"
|
||||||
MEETING_MONTH="00"
|
ITEM_MONTH="00"
|
||||||
MEETING_DAY="00"
|
ITEM_DAY="00"
|
||||||
|
|
||||||
while IFS= read -r LINE_PRE; do
|
while IFS= read -r LINE_PRE; do
|
||||||
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
|
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
|
||||||
@ -47,33 +49,16 @@ while IFS= read -r LINE_PRE; do
|
|||||||
elif [[ "$GREPDATE" != "" ]]; then
|
elif [[ "$GREPDATE" != "" ]]; then
|
||||||
# Remove HTML junk from date string.
|
# Remove HTML junk from date string.
|
||||||
DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
|
DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
|
||||||
MEETING_MONTH_WORD=$(echo "$DATES_CLEAN" | sed -E 's/^([A-Za-z]+) .*/\1/')
|
|
||||||
MEETING_DAY_SHORT=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
|
|
||||||
MEETING_DAY=$(printf "%02d" ${MEETING_DAY_SHORT#0})
|
|
||||||
MEETING_YEAR=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
|
|
||||||
|
|
||||||
case "$MEETING_MONTH_WORD" in
|
_time_parse_helper "$DATES_CLEAN"
|
||||||
Jan*) MEETING_MONTH="01" ;;
|
|
||||||
Feb*) MEETING_MONTH="02" ;;
|
|
||||||
Mar*) MEETING_MONTH="03" ;;
|
|
||||||
Apr*) MEETING_MONTH="04" ;;
|
|
||||||
May) MEETING_MONTH="05" ;;
|
|
||||||
Jun*) MEETING_MONTH="06" ;;
|
|
||||||
Jul*) MEETING_MONTH="07" ;;
|
|
||||||
Aug*) MEETING_MONTH="08" ;;
|
|
||||||
Sep*) MEETING_MONTH="09" ;;
|
|
||||||
Oct*) MEETING_MONTH="10" ;;
|
|
||||||
Nov*) MEETING_MONTH="11" ;;
|
|
||||||
Dec*) MEETING_MONTH="12" ;;
|
|
||||||
*) MEETING_MONTH="--" ;;
|
|
||||||
esac
|
|
||||||
echo " NEW MEETING FOUND"
|
echo " NEW MEETING FOUND"
|
||||||
echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
echo " DATE IS $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||||
GREPDATE=""
|
GREPDATE=""
|
||||||
else
|
else
|
||||||
# Has a previous meeting has been set? What about a date?
|
# Has a previous meeting has been set? What about a date?
|
||||||
# Remove comparison to current dates in order to download full page. Adding this for automated LA scripts.
|
# Remove comparison to current dates in order to download full page. Adding this for automated LA scripts.
|
||||||
if [[ "COMMITTEENAME" != "" ]] && [[ "MEETING_YEAR" != "" ]] && (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
|
if [[ "COMMITTEENAME" != "" ]] && [[ "ITEM_YEAR" != "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
|
||||||
# Not changing meetings, and we know that an old meeting has alread been set. Keep going.
|
# Not changing meetings, and we know that an old meeting has alread been set. Keep going.
|
||||||
|
|
||||||
# If match --> make folder --> download
|
# If match --> make folder --> download
|
||||||
@ -85,25 +70,25 @@ while IFS= read -r LINE_PRE; do
|
|||||||
# Well... this aged well.
|
# Well... this aged well.
|
||||||
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then
|
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then
|
||||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null
|
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null
|
||||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR" 2> /dev/null
|
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR" 2> /dev/null
|
||||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY" 2> /dev/null
|
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY" 2> /dev/null
|
||||||
if [[ "$AGENDAURL" != "" ]]; then
|
if [[ "$AGENDAURL" != "" ]]; then
|
||||||
echo " DOWNLOAD AGENDA PDF"
|
echo " DOWNLOAD AGENDA PDF"
|
||||||
echo " $AGENDAURL"
|
echo " $AGENDAURL"
|
||||||
wget --user-agent="$WGET_UA" "$AGENDAURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Agenda.pdf" -c -q #--show-progress
|
_utils_download_helper "$AGENDAURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Agenda.pdf"
|
||||||
elif [[ "$MINUTESURL" != "" ]]; then
|
elif [[ "$MINUTESURL" != "" ]]; then
|
||||||
echo " DOWNLOAD MINUTES PDF"
|
echo " DOWNLOAD MINUTES PDF"
|
||||||
echo " $MINUTESURL"
|
echo " $MINUTESURL"
|
||||||
wget --user-agent="$WGET_UA" "$MINUTESURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Minutes.pdf" -c -q #--show-progress
|
_utils_download_helper "$MINUTESURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Minutes.pdf"
|
||||||
elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then
|
elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then
|
||||||
if [[ "$AGENDAHTMLURL" != "" ]]; then
|
if [[ "$AGENDAHTMLURL" != "" ]]; then
|
||||||
echo " DOWNLOAD AGENDA HTML TO CRAWL"
|
echo " DOWNLOAD AGENDA HTML TO CRAWL"
|
||||||
echo " $AGENDAHTMLURL"
|
echo " $AGENDAHTMLURL"
|
||||||
wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress
|
_utils_download_helper "$AGENDAHTMLURL" "./tmp/work.html"
|
||||||
elif [[ "$MINUTESHTMLURL" != "" ]]; then
|
elif [[ "$MINUTESHTMLURL" != "" ]]; then
|
||||||
echo " DOWNLOAD MINUTES HTML TO CRAWL"
|
echo " DOWNLOAD MINUTES HTML TO CRAWL"
|
||||||
echo " $MINUTESHTMLURL"
|
echo " $MINUTESHTMLURL"
|
||||||
wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress
|
_utils_download_helper "$MINUTESHTMLURL" "./tmp/work.html"
|
||||||
fi
|
fi
|
||||||
while IFS= read -r LINE_HTML_PRE; do
|
while IFS= read -r LINE_HTML_PRE; do
|
||||||
LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /')
|
LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /')
|
||||||
@ -112,25 +97,25 @@ while IFS= read -r LINE_PRE; do
|
|||||||
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
|
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
|
||||||
if [[ "$GREPARTICLESTART" != "" ]]; then
|
if [[ "$GREPARTICLESTART" != "" ]]; then
|
||||||
echo " FOUND INDEX ARTICLE START"
|
echo " FOUND INDEX ARTICLE START"
|
||||||
ISARTICLE="TRUE"
|
ISARTICLE=1
|
||||||
elif [[ "$GREPARTICLEEND" != "" ]]; then
|
elif [[ "$GREPARTICLEEND" != "" ]]; then
|
||||||
echo " END OF INDEX ARTICLE"
|
echo " END OF INDEX ARTICLE"
|
||||||
ISARTICLE=""
|
ISARTICLE=0
|
||||||
elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then
|
elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
|
||||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
|
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments" 2> /dev/null
|
||||||
ISPDF=$(echo $GREPLINK | grep "\.pdf")
|
ISPDF=$(echo $GREPLINK | grep "\.pdf")
|
||||||
if [[ "$ISPDF" != "" ]]; then
|
if [[ "$ISPDF" != "" ]]; then
|
||||||
PDFNAME=$(echo $ISPDF | sed 's/.*\///')
|
PDFNAME=$(echo $ISPDF | sed 's/.*\///')
|
||||||
echo " DOWNLOAD ATTACHMENT PDF"
|
echo " DOWNLOAD ATTACHMENT PDF"
|
||||||
echo " $ISPDF"
|
echo " $ISPDF"
|
||||||
wget --user-agent="$WGET_UA" "$ISPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFNAME" -c -q #--show-progress
|
_utils_download_helper "$ISPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFNAME"
|
||||||
else
|
else
|
||||||
# Extract title of attachment
|
# Extract title of attachment
|
||||||
ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g')
|
ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g')
|
||||||
echo " DOWNLOAD ATTACHMENT HTML"
|
echo " DOWNLOAD ATTACHMENT HTML"
|
||||||
echo " $ATTACHTITLE"
|
echo " $ATTACHTITLE"
|
||||||
echo " $GREPLINK"
|
echo " $GREPLINK"
|
||||||
wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress
|
_utils_download_helper "$GREPLINK" "./tmp/attachment.html"
|
||||||
while IFS= read -r LINE_ATTACH_PRE; do
|
while IFS= read -r LINE_ATTACH_PRE; do
|
||||||
LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /')
|
LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /')
|
||||||
GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article")
|
GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article")
|
||||||
@ -141,23 +126,23 @@ while IFS= read -r LINE_PRE; do
|
|||||||
# CSS for the HTML is in the default template
|
# CSS for the HTML is in the default template
|
||||||
cat ./template/default.html > ./tmp/new.html
|
cat ./template/default.html > ./tmp/new.html
|
||||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||||
ISATTACHMENTARTICLE="TRUE"
|
ISATTACHMENTARTICLE=1
|
||||||
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
|
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
|
||||||
echo " END OF ATTACHMENT ARTICLE"
|
echo " END OF ATTACHMENT ARTICLE"
|
||||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||||
echo " PROCESSED TO PDF"
|
echo " PROCESSED TO PDF"
|
||||||
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
||||||
ISATTACHMENTARTICLE=""
|
ISATTACHMENTARTICLE=0
|
||||||
elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then
|
elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
|
||||||
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
|
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
|
||||||
if [[ "$ISREFPDF" != "" ]]; then
|
if [[ "$ISREFPDF" != "" ]]; then
|
||||||
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
|
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
|
||||||
echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
|
echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
|
||||||
echo " $GREPATTACHMENTLINK"
|
echo " $GREPATTACHMENTLINK"
|
||||||
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
|
_utils_download_helper "$ISREFPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFREFNAME"
|
||||||
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
|
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
|
||||||
fi
|
fi
|
||||||
elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then
|
elif (( ISATTACHMENTARTICLE )); then
|
||||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||||
fi
|
fi
|
||||||
LINE_ATTACH=""
|
LINE_ATTACH=""
|
||||||
|
|||||||
532
SCRAPE_MEET.SH
Normal file → Executable file
532
SCRAPE_MEET.SH
Normal file → Executable file
@ -1,185 +1,13 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
echo -e "\n-========================================================================-"
|
echo -e "\n-========================================================================-"
|
||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
|
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
|
||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-=- Lillian Skinner -=-"
|
echo -e "-=- Lillian Skinner -=-"
|
||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-========================================================================-"
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
conv_date() {
|
source ./functions/.functions
|
||||||
echo "$1"
|
|
||||||
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
|
|
||||||
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
case "$MEETING_MONTH_WORD" in
|
|
||||||
Jan*) MEETING_MONTH="01" ;;
|
|
||||||
Feb*) MEETING_MONTH="02" ;;
|
|
||||||
Mar*) MEETING_MONTH="03" ;;
|
|
||||||
Apr*) MEETING_MONTH="04" ;;
|
|
||||||
May) MEETING_MONTH="05" ;;
|
|
||||||
Jun*) MEETING_MONTH="06" ;;
|
|
||||||
Jul*) MEETING_MONTH="07" ;;
|
|
||||||
Aug*) MEETING_MONTH="08" ;;
|
|
||||||
Sep*) MEETING_MONTH="09" ;;
|
|
||||||
Oct*) MEETING_MONTH="10" ;;
|
|
||||||
Nov*) MEETING_MONTH="11" ;;
|
|
||||||
Dec*) MEETING_MONTH="12" ;;
|
|
||||||
*) MEETING_MONTH="--" ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
conv_date_alt() {
|
|
||||||
echo "$1"
|
|
||||||
MEETING_MONTH_WORD=$(echo "$1" | sed 's/^[^ ]* //' | sed 's/ .*//')
|
|
||||||
MEETING_DAY_SHORT=$(echo "$1" | sed 's/ .*//')
|
|
||||||
MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
|
|
||||||
MEETING_YEAR=$(echo "$1" | sed 's/.* //')
|
|
||||||
case "$MEETING_MONTH_WORD" in
|
|
||||||
Jan*) MEETING_MONTH="01" ;;
|
|
||||||
Feb*) MEETING_MONTH="02" ;;
|
|
||||||
Mar*) MEETING_MONTH="03" ;;
|
|
||||||
Apr*) MEETING_MONTH="04" ;;
|
|
||||||
May) MEETING_MONTH="05" ;;
|
|
||||||
Jun*) MEETING_MONTH="06" ;;
|
|
||||||
Jul*) MEETING_MONTH="07" ;;
|
|
||||||
Aug*) MEETING_MONTH="08" ;;
|
|
||||||
Sep*) MEETING_MONTH="09" ;;
|
|
||||||
Oct*) MEETING_MONTH="10" ;;
|
|
||||||
Nov*) MEETING_MONTH="11" ;;
|
|
||||||
Dec*) MEETING_MONTH="12" ;;
|
|
||||||
*) MEETING_MONTH="--" ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
set_agenda_url() {
|
|
||||||
case "$1" in
|
|
||||||
'"Agenda (HTML)"')
|
|
||||||
AGENDA_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Agenda (PDF)"')
|
|
||||||
AGENDA_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Revised Agenda (HTML)"')
|
|
||||||
AGENDA_REVISE_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Revised Agenda (PDF)"')
|
|
||||||
AGENDA_REVISE_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Minutes (HTML)"')
|
|
||||||
MINUTES_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Minutes (PDF)"')
|
|
||||||
MINUTES_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Minutes with Attachments (PDF)"')
|
|
||||||
MINUTES_ATTACH_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
|
|
||||||
'"Agenda Full Package (HTML)"')
|
|
||||||
AGENDA_FULL_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Agenda Full Package (PDF)"')
|
|
||||||
AGENDA_FULL_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Agenda Cover Page (HTML)"')
|
|
||||||
AGENDA_COVER_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Agenda Cover Page (PDF)"')
|
|
||||||
AGENDA_COVER_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Post Agenda (HTML)"')
|
|
||||||
AGENDA_POST_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Post Agenda (PDF)"')
|
|
||||||
AGENDA_POST_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Addendum (HTML)"')
|
|
||||||
ADDENDUM_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
'"Addendum (PDF)"')
|
|
||||||
ADDENDUM_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
clear_agenda_url() {
|
|
||||||
AGENDA_HTML_URL=""
|
|
||||||
AGENDA_PDF_URL=""
|
|
||||||
AGENDA_REVISE_HTML_URL=""
|
|
||||||
AGENDA_REVISE_PDF_URL=""
|
|
||||||
MINUTES_HTML_URL=""
|
|
||||||
MINUTES_PDF_URL=""
|
|
||||||
MINUTES_ATTACH_PDF_URL=""
|
|
||||||
|
|
||||||
AGENDA_FULL_HTML_URL=""
|
|
||||||
AGENDA_FULL_PDF_URL=""
|
|
||||||
AGENDA_COVER_HTML_URL=""
|
|
||||||
AGENDA_COVER_PDF_URL=""
|
|
||||||
AGENDA_POST_HTML_URL=""
|
|
||||||
AGENDA_POST_PDF_URL=""
|
|
||||||
ADDENDUM_HTML_URL=""
|
|
||||||
ADDENDUM_PDF_URL=""
|
|
||||||
}
|
|
||||||
|
|
||||||
download_agendas() {
|
|
||||||
if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then
|
|
||||||
if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving revised agenda as PDF..."
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$1/Agenda_Revised.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $AGENDA_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving regular agenda as PDF..."
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$1/Agenda.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
elif [[ $AGENDA_REVISE_HTML_URL != "" ]] || [[ $AGENDA_HTML_URL != "" ]]; then
|
|
||||||
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving revised agenda as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$1/Agenda_Revised.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $AGENDA_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving regular agenda as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$1/Agenda.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
elif [[ $AGENDA_FULL_PDF_URL != "" ]] || [[ $AGENDA_FULL_HTML_URL != "" ]]; then
|
|
||||||
if [[ $AGENDA_FULL_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving full package agenda as PDF... (no HTML found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_PDF_URL" -O "$1/Agenda_FullPackage.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $AGENDA_FULL_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving full package agenda as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O "$1/Agenda_FullPackage.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
elif [[ $AGENDA_POST_PDF_URL != "" ]] || [[ $AGENDA_POST_HTML_URL != "" ]]; then
|
|
||||||
if [[ $AGENDA_POST_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving post agenda as HTML... (no HTML found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_PDF_URL" -O "$1/Agenda_Post.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $AGENDA_POST_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving post agenda as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O "$1/Agenda_Post.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then
|
|
||||||
if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving minutes with attachments as PDF..."
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$1/Minutes_With_Attachments.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $MINUTES_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving minutes as PDF..."
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$1/Minutes.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
if [[ $MINUTES_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving minutes as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$1/Minutes.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $AGENDA_COVER_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving cover agenda as PDF... (no HTML found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_PDF_URL" -O "$1/Agenda_Cover.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving cover agenda as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O "$1/Agenda_Cover.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $ADDENDUM_PDF_URL != "" ]]; then
|
|
||||||
echo "Saving addendum as PDF... (no HTML found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_PDF_URL" -O "$1/Addendum.pdf" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
|
||||||
echo "Saving addendum as HTML... (no PDF found!)"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O "$1/Addendum.html" -N -q #--show-progress
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Warning to all who read this script:
|
# Warning to all who read this script:
|
||||||
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
||||||
@ -196,12 +24,12 @@ ADDENDUM_HTML="./tmp/addendum.html"
|
|||||||
|
|
||||||
current_year=$(date +%Y)
|
current_year=$(date +%Y)
|
||||||
current_month=$(date +%m)
|
current_month=$(date +%m)
|
||||||
current_day=$(date +%d)00
|
current_day=$(date +%d)
|
||||||
|
|
||||||
SUPPORT_PAST="FALSE"
|
SUPPORT_PAST=""
|
||||||
|
|
||||||
if [ -d "$TEMP_DIR" ]; then
|
if [ -d "$TEMP_DIR" ]; then
|
||||||
rm -r $TEMP_DIR
|
rm -r $TEMP_DIR
|
||||||
fi
|
fi
|
||||||
rm -f $INDEX_PAGE
|
rm -f $INDEX_PAGE
|
||||||
rm -f $SEARCH_PAGE
|
rm -f $SEARCH_PAGE
|
||||||
@ -209,215 +37,211 @@ rm -f $AGENDA_HTML
|
|||||||
|
|
||||||
mkdir $TEMP_DIR
|
mkdir $TEMP_DIR
|
||||||
|
|
||||||
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||||
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
|
INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||||
|
INDEX_END="FALSE"
|
||||||
INDEX_END="FALSE"
|
while [[ $INDEX_END == "FALSE" ]]; do
|
||||||
while [[ $INDEX_END == "FALSE" ]]; do
|
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --no-hsts --show-progress
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
if [ $? -ne 8 ]; then
|
||||||
if [ $? -ne 8 ]; then
|
|
||||||
FOUNDLIST="FALSE"
|
FOUNDLIST="FALSE"
|
||||||
while IFS= read -r LINE; do
|
while IFS= read -r LINE; do
|
||||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||||
if [[ "$GREPENDLIST" == "" ]]; then
|
if [[ "$GREPENDLIST" == "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: End of list."
|
echo "SCRAPE_ESCRIBE: End of list."
|
||||||
INDEX_END="TRUE"
|
INDEX_END="TRUE"
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||||
echo "-========================================================================-"
|
echo "-========================================================================-"
|
||||||
echo "- $MEETING_NAME"
|
echo "- $MEETING_NAME"
|
||||||
|
|
||||||
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
||||||
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
||||||
echo "- Corrected to: $MEETING_NAME"
|
echo "- Corrected to: $MEETING_NAME"
|
||||||
fi
|
fi
|
||||||
# Pages start at 1. Ew.
|
# Pages start at 1. Ew.
|
||||||
x=1
|
x=1
|
||||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
|
||||||
#cat "${TEMP_DIR}escribe.json" > debug.json
|
#cat "${TEMP_DIR}escribe.json" > debug.json
|
||||||
|
|
||||||
y=0
|
y=0
|
||||||
i=0
|
i=0
|
||||||
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
||||||
while (true); do
|
while (true); do
|
||||||
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
|
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length')
|
||||||
|
|
||||||
if [[ "$NUM_IN_JSON" == "" ]]; then
|
if [[ "$NUM_IN_JSON" == "" ]]; then
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Decrease in the meeting count == we're on the final page.
|
# Decrease in the meeting count == we're on the final page.
|
||||||
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then
|
if (($i >= $NUM_IN_JSON)) && ((10#$NUM_IN_JSON >= 50)); then
|
||||||
((x++))
|
((x++))
|
||||||
i=0
|
i=0
|
||||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
|
||||||
elif (( $i >= 10#$NUM_IN_JSON )); then
|
elif (($i >= 10#$NUM_IN_JSON)); then
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
echo "$(($i + 1)) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
||||||
|
|
||||||
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
||||||
# No need to cat the entire file every time.
|
# No need to cat the entire file every time.
|
||||||
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
|
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' >"${TEMP_DIR}escribe_short.json"
|
||||||
|
|
||||||
#echo "> Meeting ID"
|
#echo "> Meeting ID"
|
||||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
||||||
#echo "> Meeting Attachments"
|
#echo "> Meeting Attachments"
|
||||||
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
||||||
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
||||||
|
|
||||||
clear_agenda_url
|
clear_agenda_url
|
||||||
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
|
for ((j = 0; j <= (($NUM_ATTACHMENTS - 1)); j++)); do
|
||||||
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
||||||
done
|
done
|
||||||
|
|
||||||
# "25 Feb 2026"
|
_time_parse_helper "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||||
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
|
|
||||||
echo "Alternate date format."
|
|
||||||
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
|
||||||
# "Feb 25 2026"
|
|
||||||
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
|
|
||||||
echo "Standard date format."
|
|
||||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
|
||||||
else
|
|
||||||
echo "COULD NOT FIGURE OUT DATE FORMAT!"
|
|
||||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
|
||||||
fi
|
|
||||||
|
|
||||||
INPAST=""
|
INPAST=""
|
||||||
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
|
if ((10#$ITEM_YEAR >= 10#$current_year)) && ((10#$ITEM_MONTH >= $((10#$current_month - 1)))); then
|
||||||
echo "NAME : $MEETING_NAME"
|
echo "NAME : $MEETING_NAME"
|
||||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||||
echo "A (H) : $AGENDA_HTML_URL"
|
echo "A (H) : $AGENDA_HTML_URL"
|
||||||
echo "A (P) : $AGENDA_PDF_URL"
|
echo "A (P) : $AGENDA_PDF_URL"
|
||||||
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
|
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
|
||||||
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
|
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
|
||||||
echo "AF(H) : $AGENDA_FULL_HTML_URL"
|
echo "AF(H) : $AGENDA_FULL_HTML_URL"
|
||||||
echo "AF(P) : $AGENDA_FULL_PDF_URL"
|
echo "AF(P) : $AGENDA_FULL_PDF_URL"
|
||||||
echo "AC(H) : $AGENDA_COVER_HTML_URL"
|
echo "AC(H) : $AGENDA_COVER_HTML_URL"
|
||||||
echo "AC(P) : $AGENDA_COVER_PDF_URL"
|
echo "AC(P) : $AGENDA_COVER_PDF_URL"
|
||||||
echo "AP(H) : $AGENDA_POST_HTML_URL"
|
echo "AP(H) : $AGENDA_POST_HTML_URL"
|
||||||
echo "AP(P) : $AGENDA_POST_PDF_URL"
|
echo "AP(P) : $AGENDA_POST_PDF_URL"
|
||||||
echo "M (H) : $MINUTES_HTML_URL"
|
echo "M (H) : $MINUTES_HTML_URL"
|
||||||
echo "M (P) : $MINUTES_PDF_URL"
|
echo "M (P) : $MINUTES_PDF_URL"
|
||||||
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
|
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
|
||||||
echo "AD(H) : $ADDENDUM_HTML_URL"
|
echo "AD(H) : $ADDENDUM_HTML_URL"
|
||||||
echo "AD(P) : $ADDENDUM_PDF_URL"
|
echo "AD(P) : $ADDENDUM_PDF_URL"
|
||||||
else
|
else
|
||||||
echo "Dates are in the past!"
|
echo "Dates are in the past!"
|
||||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||||
INPAST="TRUE"
|
INPAST="TRUE"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
||||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
||||||
echo "Abort."
|
echo "Abort."
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#echo "> Meeting Video"
|
#echo "> Meeting Video"
|
||||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||||
|
|
||||||
ERROR="FALSE"
|
ERROR="FALSE"
|
||||||
ADDENDUM_ERROR="FALSE"
|
ADDENDUM_ERROR="FALSE"
|
||||||
echo "Downloading agenda HTML..."
|
echo "Downloading agenda HTML..."
|
||||||
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
|
||||||
elif [[ $AGENDA_HTML_URL != "" ]]; then
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
|
||||||
elif [[ $AGENDA_FULL_HTML_URL != "" ]]; then
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
|
||||||
elif [[ $AGENDA_POST_HTML_URL != "" ]]; then
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
|
||||||
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
|
||||||
else
|
|
||||||
ERROR="TRUE"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
if [[ -n $AGENDA_REVISE_HTML_URL ]]; then
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
|
_utils_download_helper "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
|
||||||
else
|
elif [[ -n $AGENDA_HTML_URL ]]; then
|
||||||
ADDENDUM_ERROR="TRUE"
|
_utils_download_helper "$AGENDA_HTML_URL" "$AGENDA_HTML"
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "$ERROR" == "FALSE" ]]; then
|
elif [[ -n $AGENDA_FULL_HTML_URL ]]; then
|
||||||
|
_utils_download_helper "$AGENDA_FULL_HTML_URL" "$AGENDA_HTML"
|
||||||
|
|
||||||
mkdir "./$CITY_ARCHIVE_NAME"
|
elif [[ -n $AGENDA_POST_HTML_URL ]]; then
|
||||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
_utils_download_helper "$AGENDA_POST_HTML_URL" "$AGENDA_HTML"
|
||||||
|
|
||||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
|
elif [[ -n $AGENDA_COVER_HTML_URL ]]; then
|
||||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
|
_utils_download_helper "$AGENDA_COVER_HTML_URL" "$AGENDA_HTML"
|
||||||
fi
|
else
|
||||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then
|
ERROR="TRUE"
|
||||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/"
|
fi
|
||||||
fi
|
|
||||||
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
|
|
||||||
if [ ! -d "$MEETING_DIR" ]; then
|
|
||||||
mkdir "$MEETING_DIR/"
|
|
||||||
fi
|
|
||||||
if [ ! -d "$MEETING_DIR/Attachments" ]; then
|
|
||||||
mkdir "$MEETING_DIR/Attachments/"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $VIDEO_URL != "" ]]; then
|
if [[ -n $ADDENDUM_HTML_URL ]]; then
|
||||||
echo "Saving recording URL..."
|
_utils_download_helper "$ADDENDUM_HTML_URL" "$ADDENDUM_HTML"
|
||||||
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
|
else
|
||||||
fi
|
ADDENDUM_ERROR="TRUE"
|
||||||
|
fi
|
||||||
|
|
||||||
# Get attachment links
|
if [[ "$ERROR" == "FALSE" ]]; then
|
||||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
|
||||||
# Get attachment names
|
|
||||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
|
||||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
|
||||||
# Get attachment links
|
|
||||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
|
||||||
# Get attachment names
|
|
||||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
|
||||||
fi
|
|
||||||
# Download attachment and use the name grabbed above
|
|
||||||
echo "Found the following agenda attachments:"
|
|
||||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
|
||||||
echo "- $LINEA2"
|
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -N -q #--show-progress
|
|
||||||
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
|
|
||||||
echo "All attachments saved."
|
|
||||||
|
|
||||||
download_agendas "$MEETING_DIR"
|
mkdir "./$CITY_ARCHIVE_NAME"
|
||||||
|
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||||
|
|
||||||
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
|
||||||
echo "dir not empty" >> /dev/null
|
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
|
||||||
else
|
fi
|
||||||
rm -r "$MEETING_DIR/Attachments"
|
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR" ]; then
|
||||||
fi
|
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR/"
|
||||||
|
fi
|
||||||
|
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$ITEM_YEAR" "$ITEM_MONTH" "$ITEM_DAY")
|
||||||
|
if [ ! -d "$MEETING_DIR" ]; then
|
||||||
|
mkdir "$MEETING_DIR/"
|
||||||
|
fi
|
||||||
|
if [ ! -d "$MEETING_DIR/Attachments" ]; then
|
||||||
|
mkdir "$MEETING_DIR/Attachments/"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "All files from this meeting have been saved."
|
if [[ $VIDEO_URL != "" ]]; then
|
||||||
fi
|
echo "Saving recording URL..."
|
||||||
|
echo "https://video.isilive.ca/london/"$VIDEO_URL >"$MEETING_DIR/RecordingLink.txt"
|
||||||
|
fi
|
||||||
|
|
||||||
((i++))
|
# Get attachment links
|
||||||
((y++))
|
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
|
||||||
|
# Get attachment names
|
||||||
|
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
|
||||||
|
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
||||||
|
# Get attachment links
|
||||||
|
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
|
||||||
|
# Get attachment names
|
||||||
|
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
|
||||||
|
fi
|
||||||
|
# Download attachment and use the name grabbed above
|
||||||
|
echo "Found the following agenda attachments:"
|
||||||
|
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||||
|
echo "- $LINEA2 / $LINEA1"
|
||||||
|
_utils_download_helper "$INDEX_URL$LINEA1" "$MEETING_DIR/Attachments/$LINEA2"
|
||||||
|
# [ ! -s "$MEETING_DIR/Attachments/$LINEA2" ] && rm -f "$MEETING_DIR/Attachments/$LINEA2"
|
||||||
|
done < ./tmp/attachment_urls 3<./tmp/attachment_names
|
||||||
|
echo "All attachments saved."
|
||||||
|
|
||||||
|
download_agendas "$MEETING_DIR"
|
||||||
|
|
||||||
|
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
||||||
|
echo "dir not empty" >>/dev/null
|
||||||
|
else
|
||||||
|
rm -r "$MEETING_DIR/Attachments"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "All files from this meeting have been saved."
|
||||||
|
find "$MEETING_DIR" -type f -size 0 -delete
|
||||||
|
echo "Cleaning PDFs for archive.org..."
|
||||||
|
find "$MEETING_DIR" -type f -name '*.pdf' -print0 | xargs -0 -n1 qpdf --replace-input
|
||||||
|
# qpdf repairs and leaves garbage original PDFs
|
||||||
|
find "$MEETING_DIR" -type f -name '*~qpdf-orig' -delete -print
|
||||||
|
fi
|
||||||
|
|
||||||
|
((i++))
|
||||||
|
((y++))
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||||
if [[ "$GREPLIST" != "" ]]; then
|
if [[ "$GREPLIST" != "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||||
FOUNDLIST="TRUE"
|
FOUNDLIST="TRUE"
|
||||||
fi
|
fi
|
||||||
done < $INDEX_PAGE
|
done < $INDEX_PAGE
|
||||||
else
|
else
|
||||||
INDEX_END="TRUE"
|
INDEX_END="TRUE"
|
||||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
done < websites.csv
|
done < websites.csv
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
47
SCRAPE_MPaS.SH
Executable file
47
SCRAPE_MPaS.SH
Executable file
@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
echo -e "\n-========================================================================-"
|
||||||
|
echo -e "-=- -=-"
|
||||||
|
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
|
||||||
|
echo -e "-=- -=-"
|
||||||
|
echo -e "-=- Lillian Skinner -=-"
|
||||||
|
echo -e "-=- -=-"
|
||||||
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
|
source ./functions/.functions
|
||||||
|
|
||||||
|
# London seems to have recently blocked unusual user agents, so the default wget user agent (and even ping) no longer works. Thankfully, we can pretend to be a real browser!
|
||||||
|
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||||
|
|
||||||
|
TEMP_DIR="./tmp/"
|
||||||
|
SEARCH_PAGE="./tmp/index_mpas.html"
|
||||||
|
|
||||||
|
rm -f $SEARCH_PAGE
|
||||||
|
|
||||||
|
mkdir $TEMP_DIR
|
||||||
|
|
||||||
|
SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"
|
||||||
|
|
||||||
|
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||||
|
if [ $? -ne 8 ]; then
|
||||||
|
cat "$SEARCH_PAGE" | sed 's/></>\n</g' | \
|
||||||
|
while IFS= read -r LINE; do
|
||||||
|
if (( LAST_LINE_SUMMARY )) && [[ "$LINE" == "<span>"* ]]; then
|
||||||
|
CURRENT=$(echo "$LINE" | sed 's/></>\n</g' | sed 's/.*<span>\([^<]*\)<\/span>.*/\1/')
|
||||||
|
echo $CURRENT
|
||||||
|
fi
|
||||||
|
|
||||||
|
LAST_LINE_SUMMARY=0
|
||||||
|
if echo "$LINE" | grep -q '<summary>'; then
|
||||||
|
LAST_LINE_SUMMARY=1
|
||||||
|
CURRENT=$(echo "$LINE" | sed 's/></>\n</g' | sed 's/.*<summary>\([^<]*\)<\/summary>.*/\1/')
|
||||||
|
echo $CURRENT
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$LINE" == *".pdf"* ]]; then
|
||||||
|
FILE="$(echo "$LINE" | grep -o 'href="[^"]*\.pdf"' | sed 's/^href="//; s/"$//; s#^https://london\.ca##' | sed 's/%20/ /g' | sed 's/%27//g')" # Fix stupid sublime syntax highlighting: '
|
||||||
|
echo $FILE
|
||||||
|
mkdir -p "./LondonArchive/Master Plans and Strategies/$CURRENT/"
|
||||||
|
_utils_download_helper "https://london.ca$FILE" "./LondonArchive/Master Plans and Strategies/$CURRENT/$(basename "$FILE")"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
44
SCRAPE_OPEN.SH
Normal file → Executable file
44
SCRAPE_OPEN.SH
Normal file → Executable file
@ -8,15 +8,19 @@ echo -e "-=- Lillian Skinner
|
|||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-========================================================================-"
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
|
source ./functions/.functions
|
||||||
|
|
||||||
WORKDIR="./tmp"
|
WORKDIR="./tmp"
|
||||||
STAGEDIR="./staging"
|
STAGEDIR="./staging"
|
||||||
DOCDIR="./LondonArchive_OpenData"
|
DOCDIR="./LondonArchive/OpenData"
|
||||||
MAPDIR="./LondonArchive_OpenData/Maps"
|
MAPDIR="./LondonArchive/OpenData/Maps"
|
||||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||||
|
|
||||||
mkdir $WORKDIR
|
mkdir -p $WORKDIR
|
||||||
mkdir $DOCDIR
|
mkdir -p $DOCDIR
|
||||||
mkdir $MAPDIR
|
mkdir -p $MAPDIR
|
||||||
|
|
||||||
|
DOWNLOAD_MAPS=0
|
||||||
|
|
||||||
i=0
|
i=0
|
||||||
SEARCH_END=0
|
SEARCH_END=0
|
||||||
@ -50,18 +54,25 @@ while [[ $SEARCH_END == 0 ]]; do
|
|||||||
echo " Cur. article: $i.$j, URL : $ITEM_URL"
|
echo " Cur. article: $i.$j, URL : $ITEM_URL"
|
||||||
echo " Cur. article: $i.$j, Name : $ITEM_NAME"
|
echo " Cur. article: $i.$j, Name : $ITEM_NAME"
|
||||||
|
|
||||||
rm -rf $STAGEDIR
|
#rm -rf $STAGEDIR
|
||||||
mkdir $STAGEDIR
|
#mkdir $STAGEDIR
|
||||||
|
|
||||||
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
|
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
|
||||||
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$STAGEDIR/$ITEM_NAME" -c -q
|
_utils_download_helper "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" "$DOCDIR/$ITEM_NAME"
|
||||||
|
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$DOCDIR/$ITEM_NAME" -c -q
|
||||||
echo " Downloaded."
|
echo " Downloaded."
|
||||||
|
|
||||||
echo "Compressing."
|
echo "(Not) Compressing."
|
||||||
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
|
# No need to compress non-map data.
|
||||||
|
#7z a "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]]; then
|
# This section is deprecated. Use SCRAPE_AGIS.SH instead.
|
||||||
|
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]] && (( DOWNLOAD_MAPS )); then
|
||||||
|
MAPDIR_ITEM=$(echo "$MAPDIR/$ITEM_TITLE")
|
||||||
|
mkdir -p "$MAPDIR_ITEM"
|
||||||
|
echo "Item: $MAPDIR_ITEM"
|
||||||
|
|
||||||
MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')"
|
MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')"
|
||||||
echo " ^^^ Item is map. ($MAP_ID) "
|
echo " ^^^ Item is map. ($MAP_ID) "
|
||||||
# https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
|
# https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
|
||||||
@ -71,21 +82,22 @@ while [[ $SEARCH_END == 0 ]]; do
|
|||||||
MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
|
MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
|
||||||
MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"
|
MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"
|
||||||
echo " Map URL (CSV) : $MAP_CSV"
|
echo " Map URL (CSV) : $MAP_CSV"
|
||||||
wget --user-agent="$WGET_UA" "$MAP_CSV" -O "$STAGEDIR/$ITEM_TITLE.csv" -c -q
|
_utils_download_helper "$MAP_CSV" "$MAPDIR_ITEM/$ITEM_TITLE.csv"
|
||||||
echo " Downloaded."
|
echo " Downloaded."
|
||||||
echo " Map URL (Shapefile): $MAP_SHP"
|
echo " Map URL (Shapefile): $MAP_SHP"
|
||||||
wget --user-agent="$WGET_UA" "$MAP_SHP" -O "$STAGEDIR/$ITEM_TITLE.shp" -c -q
|
_utils_download_helper "$MAP_SHP" "$MAPDIR_ITEM/$ITEM_TITLE.shp"
|
||||||
echo " Downloaded."
|
echo " Downloaded."
|
||||||
echo " Map URL (GeoJSON) : $MAP_GEO"
|
echo " Map URL (GeoJSON) : $MAP_GEO"
|
||||||
wget --user-agent="$WGET_UA" "$MAP_GEO" -O "$STAGEDIR/$ITEM_TITLE.geojson" -c -q
|
_utils_download_helper "$MAP_GEO" "$MAPDIR_ITEM/$ITEM_TITLE.geojson"
|
||||||
echo " Downloaded."
|
echo " Downloaded."
|
||||||
echo " Map URL (KML) : $MAP_KML"
|
echo " Map URL (KML) : $MAP_KML"
|
||||||
wget --user-agent="$WGET_UA" "$MAP_KML" -O "$STAGEDIR/$ITEM_TITLE.kml" -c -q
|
_utils_download_helper "$MAP_KML" "$MAPDIR_ITEM/$ITEM_TITLE.kml"
|
||||||
echo " Downloaded."
|
echo " Downloaded."
|
||||||
echo ' Source URL is $ITEM_URL.'
|
echo ' Source URL is $ITEM_URL.'
|
||||||
|
|
||||||
echo "Compressing."
|
echo "Compressing."
|
||||||
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$MAPDIR/$ITEM_TITLE.7z" "$STAGEDIR"
|
rm -f "$MAPDIR_ITEM/$ITEM_TITLE.7z"
|
||||||
|
7z a "$MAPDIR_ITEM/$ITEM_TITLE.7z" "$MAPDIR_ITEM"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
78
SCRAPE_PLAN.SH
Normal file → Executable file
78
SCRAPE_PLAN.SH
Normal file → Executable file
@ -7,49 +7,7 @@ echo -e "-=- Lillian Skinner
|
|||||||
echo -e "-=- -=-"
|
echo -e "-=- -=-"
|
||||||
echo -e "-========================================================================-"
|
echo -e "-========================================================================-"
|
||||||
|
|
||||||
conv_date_plan() {
|
source ./functions/.functions
|
||||||
PROJECT_TIME_YEAR=$(echo $1 | sed 's/.*\([0-9]\{4\}\).*/\1/p' | uniq)
|
|
||||||
PROJECT_TIME_MONTH_WORD=$(echo $1 | sed 's/.*,\s*\([A-Za-z]*\)\s[0-9]\{1,2\},.*/\1/p' | uniq)
|
|
||||||
PROJECT_TIME_DAY_SHORT=$(echo $1 | sed 's/.*,\s*[A-Za-z]*\s\([0-9]\{1,2\}\),.*/\1/p' | uniq)
|
|
||||||
PROJECT_TIME_DAY=$(printf "%02d" $PROJECT_TIME_DAY_SHORT)
|
|
||||||
case "$PROJECT_TIME_MONTH_WORD" in
|
|
||||||
Jan*) PROJECT_TIME_MONTH="01" ;;
|
|
||||||
Feb*) PROJECT_TIME_MONTH="02" ;;
|
|
||||||
Mar*) PROJECT_TIME_MONTH="03" ;;
|
|
||||||
Apr*) PROJECT_TIME_MONTH="04" ;;
|
|
||||||
May) PROJECT_TIME_MONTH="05" ;;
|
|
||||||
Jun*) PROJECT_TIME_MONTH="06" ;;
|
|
||||||
Jul*) PROJECT_TIME_MONTH="07" ;;
|
|
||||||
Aug*) PROJECT_TIME_MONTH="08" ;;
|
|
||||||
Sep*) PROJECT_TIME_MONTH="09" ;;
|
|
||||||
Oct*) PROJECT_TIME_MONTH="10" ;;
|
|
||||||
Nov*) PROJECT_TIME_MONTH="11" ;;
|
|
||||||
Dec*) PROJECT_TIME_MONTH="12" ;;
|
|
||||||
*) PROJECT_TIME_MONTH="--" ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
conv_date() {
|
|
||||||
MODIFIED_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
MODIFIED_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
MODIFIED_DAY=$(printf "%02d" $MODIFIED_DAY_SHORT)
|
|
||||||
MODIFIED_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
|
|
||||||
case "$MODIFIED_MONTH_WORD" in
|
|
||||||
Jan*) MODIFIED_MONTH="01" ;;
|
|
||||||
Feb*) MODIFIED_MONTH="02" ;;
|
|
||||||
Mar*) MODIFIED_MONTH="03" ;;
|
|
||||||
Apr*) MODIFIED_MONTH="04" ;;
|
|
||||||
May) MODIFIED_MONTH="05" ;;
|
|
||||||
Jun*) MODIFIED_MONTH="06" ;;
|
|
||||||
Jul*) MODIFIED_MONTH="07" ;;
|
|
||||||
Aug*) MODIFIED_MONTH="08" ;;
|
|
||||||
Sep*) MODIFIED_MONTH="09" ;;
|
|
||||||
Oct*) MODIFIED_MONTH="10" ;;
|
|
||||||
Nov*) MODIFIED_MONTH="11" ;;
|
|
||||||
Dec*) MODIFIED_MONTH="12" ;;
|
|
||||||
*) MODIFIED_MONTH="--" ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
# Warning to all who read this script:
|
# Warning to all who read this script:
|
||||||
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
||||||
@ -81,8 +39,8 @@ mkdir $TEMP_DIR
|
|||||||
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
|
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
|
||||||
|
|
||||||
j=0
|
j=0
|
||||||
SEARCH_END="FALSE"
|
SEARCH_END=0
|
||||||
while [[ $SEARCH_END == "FALSE" ]]; do
|
while (( ! SEARCH_END )); do
|
||||||
echo "-========================================================================-"
|
echo "-========================================================================-"
|
||||||
echo "Downloading search results... Page $j"
|
echo "Downloading search results... Page $j"
|
||||||
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||||
@ -107,11 +65,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
|
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
|
||||||
echo " Found project: $PROJECT_NAME"
|
echo " Found project: $PROJECT_NAME"
|
||||||
|
|
||||||
MODIFIED_MONTH=""
|
ITEM_MONTH=""
|
||||||
MODIFIED_YEAR=""
|
ITEM_YEAR=""
|
||||||
conv_date "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
|
_time_parse_helper "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
|
||||||
if (( 10#$MODIFIED_YEAR >= 10#$current_year )) && (( 10#$MODIFIED_MONTH >= $((10#$current_month - 1)) )); then
|
if (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
|
||||||
echo "Last Modified: $MODIFIED_YEAR/$MODIFIED_MONTH/$MODIFIED_DAY"
|
echo "Last Modified: $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||||
else
|
else
|
||||||
echo "Dates are in the past! Abort."
|
echo "Dates are in the past! Abort."
|
||||||
break
|
break
|
||||||
@ -124,8 +82,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
rm -f $PROJECT_IMAGE_NAMES
|
rm -f $PROJECT_IMAGE_NAMES
|
||||||
|
|
||||||
while IFS= read -r PLINE; do
|
while IFS= read -r PLINE; do
|
||||||
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
|
if (( NEXT_LINE_FITEM )); then
|
||||||
NEXT_LINE_FITEM="FALSE"
|
NEXT_LINE_FITEM=0
|
||||||
|
|
||||||
# Is this line an actual item?
|
# Is this line an actual item?
|
||||||
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
|
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
|
||||||
@ -208,15 +166,15 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
fi
|
fi
|
||||||
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
|
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
|
||||||
if [[ $PROJECT_FOUND_TIME != "" ]]; then
|
if [[ $PROJECT_FOUND_TIME != "" ]]; then
|
||||||
conv_date_plan "$PLINE"
|
_time_parse_helper "$(echo $PLINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g' | cut -d, -f2- | cut -d\ -f2-)"
|
||||||
echo "Found date : $PROJECT_TIME_YEAR/$PROJECT_TIME_MONTH/$PROJECT_TIME_DAY"
|
echo "Found date : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
|
if (( NEXT_LINE_IMAGE )); then
|
||||||
NEXT_LINE_IMAGE="FALSE"
|
NEXT_LINE_IMAGE=0
|
||||||
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
|
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
|
||||||
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
||||||
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
|
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
|
||||||
@ -243,14 +201,14 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
|
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
|
||||||
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
|
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
|
||||||
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
|
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
|
||||||
NEXT_LINE_FITEM="TRUE"
|
NEXT_LINE_FITEM=1
|
||||||
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
|
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
|
||||||
# We're setting a flag to let the script know if an upcoming line is contents.
|
# We're setting a flag to let the script know if an upcoming line is contents.
|
||||||
fi
|
fi
|
||||||
|
|
||||||
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
|
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
|
||||||
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
|
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
|
||||||
NEXT_LINE_IMAGE="TRUE"
|
NEXT_LINE_IMAGE=1
|
||||||
# Same idea as before but for the image shown on the main page.
|
# Same idea as before but for the image shown on the main page.
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -340,11 +298,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
fi
|
fi
|
||||||
done < $SEARCH_PAGE
|
done < $SEARCH_PAGE
|
||||||
else
|
else
|
||||||
SEARCH_END="TRUE"
|
SEARCH_END=1
|
||||||
echo "No more pages!"
|
echo "No more pages!"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
SEARCH_END="TRUE"
|
SEARCH_END=1
|
||||||
echo "No more pages!"
|
echo "No more pages!"
|
||||||
fi
|
fi
|
||||||
((j++))
|
((j++))
|
||||||
|
|||||||
9
functions/.functions
Normal file
9
functions/.functions
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# Loader: pulls every function library that sits next to this file into the
# current shell. The directory is resolved from BASH_SOURCE so the loader works
# no matter which directory the calling script was started from.
sdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# General-purpose libraries
. "$sdir/.functions.time"
. "$sdir/.functions.utils"

# Libraries tied to one particular scraper
. "$sdir/.functions.filepro"
. "$sdir/.functions.escribe"
|
||||||
133
functions/.functions.escribe
Normal file
133
functions/.functions.escribe
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
#######################################
# Store the URL of one scraped document link in the matching *_URL global.
# Globals (written): exactly one of AGENDA_HTML_URL, AGENDA_PDF_URL,
#   AGENDA_REVISE_HTML_URL, AGENDA_REVISE_PDF_URL, MINUTES_HTML_URL,
#   MINUTES_PDF_URL, MINUTES_ATTACH_PDF_URL, AGENDA_FULL_HTML_URL,
#   AGENDA_FULL_PDF_URL, AGENDA_COVER_HTML_URL, AGENDA_COVER_PDF_URL,
#   AGENDA_POST_HTML_URL, AGENDA_POST_PDF_URL, ADDENDUM_HTML_URL,
#   ADDENDUM_PDF_URL
# Arguments:
#   $1 - link label exactly as scraped, INCLUDING its surrounding double
#        quotes, e.g. '"Agenda (PDF)"'
#   $2 - URL belonging to that label; every double quote is removed
# Returns: 0 (labels that are not recognised are silently ignored)
#######################################
set_agenda_url() {
  local target

  case "$1" in
    '"Agenda (HTML)"') target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"') target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"') target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"') target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"') target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"') target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"') target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"') target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"') target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"') target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"') target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"') target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"') target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"') target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  # The previous implementation ran an unquoted `echo $2`, which trims and
  # collapses whitespace but ALSO glob-expands the value; URLs routinely carry
  # `?` and `*`. Splitting with `read` keeps the whitespace handling without
  # ever touching the filesystem.
  local -a words=()
  local IFS=$' \t\n'
  read -r -d '' -a words <<<"$2"
  local url="${words[*]}"

  # printf -v assigns to the global named by $target (no local of that name).
  printf -v "$target" '%s' "${url//\"/}"
}
|
||||||
|
|
||||||
|
# Reset every document-URL global to the empty string so that links from a
# previously processed meeting cannot leak into the next one.
clear_agenda_url() {
  local slot

  for slot in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$slot" '%s' ''
  done
}
|
||||||
|
|
||||||
|
# Private helper for download_agendas: print the progress line, then fetch.
#   $1 - message to print   $2 - URL   $3 - destination file
_escribe_announce_and_fetch() {
  echo "$1"
  _utils_download_helper "$2" "$3"
}

# Download every document whose *_URL global is currently non-empty into the
# directory given as $1. For each document family the PDF is preferred and the
# HTML variant is fetched only when no PDF URL is known. Minutes fall back to
# HTML only when neither the plain nor the with-attachments PDF exists.
# Reads the globals filled in by set_agenda_url.
download_agendas() {
  local outdir="$1"

  # --- Agendas: both PDFs first, then the HTML fallbacks, in that order. ---
  if [[ $AGENDA_REVISE_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving revised agenda as PDF..." \
      "$AGENDA_REVISE_PDF_URL" "$outdir/Agenda_Revised.pdf"
  fi
  if [[ $AGENDA_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving regular agenda as PDF..." \
      "$AGENDA_PDF_URL" "$outdir/Agenda.pdf"
  fi
  if [[ ! $AGENDA_REVISE_PDF_URL && $AGENDA_REVISE_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving revised agenda as HTML... (no PDF found!)" \
      "$AGENDA_REVISE_HTML_URL" "$outdir/Agenda_Revised.html"
  fi
  if [[ ! $AGENDA_PDF_URL && $AGENDA_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving regular agenda as HTML... (no PDF found!)" \
      "$AGENDA_HTML_URL" "$outdir/Agenda.html"
  fi

  # --- Full package ---
  if [[ $AGENDA_FULL_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving full package agenda as PDF... (no HTML found!)" \
      "$AGENDA_FULL_PDF_URL" "$outdir/Agenda_FullPackage.pdf"
  elif [[ $AGENDA_FULL_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving full package agenda as HTML... (no PDF found!)" \
      "$AGENDA_FULL_HTML_URL" "$outdir/Agenda_FullPackage.html"
  fi

  # --- Post agenda ---
  if [[ $AGENDA_POST_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving post agenda as PDF..." \
      "$AGENDA_POST_PDF_URL" "$outdir/Agenda_Post.pdf"
  elif [[ $AGENDA_POST_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving post agenda as HTML... (no PDF found!)" \
      "$AGENDA_POST_HTML_URL" "$outdir/Agenda_Post.html"
  fi

  # --- Minutes: either PDF flavour suppresses the HTML fallback. ---
  if [[ $MINUTES_ATTACH_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving minutes with attachments as PDF..." \
      "$MINUTES_ATTACH_PDF_URL" "$outdir/Minutes_With_Attachments.pdf"
  fi
  if [[ $MINUTES_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving minutes as PDF..." \
      "$MINUTES_PDF_URL" "$outdir/Minutes.pdf"
  fi
  if [[ ! $MINUTES_ATTACH_PDF_URL && ! $MINUTES_PDF_URL && $MINUTES_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving minutes as HTML... (no PDF found!)" \
      "$MINUTES_HTML_URL" "$outdir/Minutes.html"
  fi

  # --- Cover page ---
  if [[ $AGENDA_COVER_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving cover agenda as PDF... (no HTML found!)" \
      "$AGENDA_COVER_PDF_URL" "$outdir/Agenda_Cover.pdf"
  elif [[ $AGENDA_COVER_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving cover agenda as HTML... (no PDF found!)" \
      "$AGENDA_COVER_HTML_URL" "$outdir/Agenda_Cover.html"
  fi

  # --- Addendum ---
  # Deliberately two separate `if`s rather than if/elif: the function's exit
  # status is that of the LAST statement, so it stays 0 whenever the PDF
  # branch ran, whatever the download result was.
  if [[ $ADDENDUM_PDF_URL ]]; then
    _escribe_announce_and_fetch "Saving addendum as PDF... (no HTML found!)" \
      "$ADDENDUM_PDF_URL" "$outdir/Addendum.pdf"
  fi
  if [[ ! $ADDENDUM_PDF_URL && $ADDENDUM_HTML_URL ]]; then
    _escribe_announce_and_fetch "Saving addendum as HTML... (no PDF found!)" \
      "$ADDENDUM_HTML_URL" "$outdir/Addendum.html"
  fi
}
|
||||||
34
functions/.functions.filepro
Normal file
34
functions/.functions.filepro
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
#######################################
# Recursively mirror one FilePro folder.
# Fetches the folder index page, then for every line carrying a
# data-type="document" attribute downloads the document as
# "<outdir>/<title>.pdf", and for every data-type="folder" line recurses into
# "<outdir>/<title>".
# Globals (read): WGET_UA (user agent for wget), START_URL (site base URL)
# Arguments:
#   $1 - directory that receives this folder's documents
#   $2 - URL of the folder index page
#   $3 - folder id path, used for the progress message and extended with the
#        child id on recursion (optional)
# Returns: 0 on success; 1 on bad usage or when the index cannot be fetched
#######################################
_filepro_download_folder() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <output dir> <folder index url> [folder id path]" >&2
    # return, not exit: this file is sourced, exit would kill the caller.
    return 1
  fi

  local tmp_dir="$1"
  local index_url="$2"
  local folder_path="${3:-}"

  local tmp_index
  tmp_index=$(mktemp) || return 1

  local LINE
  local LINE_ID
  local LINE_TITLE
  local LINE_TYPE
  local -a words
  local flat

  if ! wget --no-check-certificate --user-agent="$WGET_UA" "$index_url" -O "$tmp_index" --no-hsts -q; then
    echo "FAILED to fetch folder index: $index_url" >&2
    rm -f -- "$tmp_index"
    return 1
  fi

  # LINE_ID is not known yet at this point; the folder path alone identifies
  # where we are.
  echo "Looking in folder $folder_path"
  echo "Download to $tmp_dir/"

  while IFS= read -r LINE; do
    # Collapse runs of whitespace (what the old unquoted `echo $LINE` did)
    # without exposing the line to pathname expansion.
    words=()
    read -r -a words <<<"$LINE"
    flat="${words[*]}"

    LINE_ID=$(printf '%s\n' "$flat" | sed 's/.*data-id="\([^"]*\)".*/\1/g')
    # Decode the HTML entities that may appear in titles: ampersand first,
    # then the two numeric spellings of the apostrophe.
    LINE_TITLE=$(printf '%s\n' "$flat" \
      | sed 's/.*data-title="\([^"]*\)".*/\1/g' \
      | sed -e 's/&amp;/\&/g' -e "s/&#039;/'/g" -e "s/&#39;/'/g")
    LINE_TYPE=$(printf '%s\n' "$flat" | sed 's/.*data-type="\([^"]*\)".*/\1/g')

    if [[ "$LINE_TYPE" == "document" ]]; then
      echo "Found document: $LINE_ID : $LINE_TITLE.pdf... downloading..."
      mkdir -p "$tmp_dir"
      _utils_download_helper "${START_URL}/document/$LINE_ID" "$tmp_dir/$LINE_TITLE.pdf"
    elif [[ "$LINE_TYPE" == "folder" ]]; then
      _filepro_download_folder "$tmp_dir/$LINE_TITLE" "${START_URL}/filepro/documents/$LINE_ID" "$folder_path/$LINE_ID"
    fi
  done < "$tmp_index"

  rm -f -- "$tmp_index"
  return 0
}
|
||||||
71
functions/.functions.time
Normal file
71
functions/.functions.time
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
#######################################
# Detect the layout of a human-readable date and hand it to the matching
# parser, which stores the result in the ITEM_* globals.
# Recognised layouts (double quotes and the first comma are ignored while
# detecting):
#   "DD Month YYYY"  -> _time_parse_ddmonyyyy
#   "Month DD YYYY"  -> _time_parse_monddyyyy
# Arguments:
#   $1 - date string
# Outputs: echoes the raw input (progress/debug line), plus an error line
#   when the layout is not recognised
# Returns: the parser's status, or 1 on bad usage / unknown layout
#######################################
_time_parse_helper() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date>" >&2
    # return, not exit: this file is sourced, exit would kill the caller.
    return 1
  fi

  printf '%s\n' "$1"

  # Drop double quotes and trim surrounding whitespace. This is the string the
  # parsers receive: handing them the raw value would leak a leading quote
  # into the day / month fields.
  local unquoted="${1//\"/}"
  unquoted="${unquoted#"${unquoted%%[![:space:]]*}"}"
  unquoted="${unquoted%"${unquoted##*[![:space:]]}"}"

  # Layout detection additionally ignores the first comma ("May 5, 2024").
  local probe="${unquoted/,/}"

  local re_dmy='^[0-9]{1,2}[[:space:]]+.+[[:space:]]+[0-9]{4}$'
  local re_mdy='^.+[[:space:]]+[0-9]{1,2}[[:space:]]+[0-9]{4}$'

  if [[ $probe =~ $re_dmy ]]; then
    _time_parse_ddmonyyyy "$unquoted"
  elif [[ $probe =~ $re_mdy ]]; then
    _time_parse_monddyyyy "$unquoted"
  else
    echo "COULD NOT FIGURE OUT DATE FORMAT!"
    return 1
  fi
}
|
||||||
|
|
||||||
|
#######################################
# Parse a "Month DD, YYYY" date (comma optional, text after the year ignored).
# Globals (written):
#   ITEM_MONTH_WORD - month name as written
#   ITEM_DAY_SHORT  - day as written
#   ITEM_DAY        - day, zero-padded to two digits
#   ITEM_YEAR       - year
#   ITEM_MONTH      - two-digit month, or "--" for an unknown month name
# Arguments:
#   $1 - date string; surrounding whitespace and double quotes are ignored
# Returns: 0 when the string had the expected shape; 1 on bad usage or when it
#   did not (ITEM_* are then cleared and ITEM_MONTH is "--", so stale values
#   from a previous call cannot be mistaken for a result)
#######################################
_time_parse_monddyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in mon dd yyyy>" >&2
    # return, not exit: this file is sourced, exit would kill the caller.
    return 1
  fi

  local text="${1//\"/}"
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"

  local shape='^([A-Za-z]+)\.?[[:space:]]+([0-9]+),?[[:space:]]+([0-9]+)'
  if [[ ! $text =~ $shape ]]; then
    ITEM_MONTH_WORD=""
    ITEM_DAY_SHORT=""
    ITEM_DAY=""
    ITEM_YEAR=""
    ITEM_MONTH="--"
    echo "Not a 'Month DD, YYYY' date: $1" >&2
    return 1
  fi

  ITEM_MONTH_WORD="${BASH_REMATCH[1]}"
  ITEM_DAY_SHORT="${BASH_REMATCH[2]}"
  ITEM_YEAR="${BASH_REMATCH[3]}"
  # Force base 10: printf would otherwise read "08" / "09" as invalid octal.
  ITEM_DAY=$(printf '%02d' "$((10#$ITEM_DAY_SHORT))")

  case "$ITEM_MONTH_WORD" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May*) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}
|
||||||
|
|
||||||
|
#######################################
# Parse a "DD Month YYYY" date (a comma or period after the month is allowed).
# Globals (written):
#   ITEM_MONTH_WORD - month token with trailing "," or "." removed
#   ITEM_DAY_SHORT  - day as written
#   ITEM_DAY        - day, zero-padded to two digits
#   ITEM_YEAR       - year
#   ITEM_MONTH      - two-digit month, or "--" for an unknown month name
# Arguments:
#   $1 - date string; surrounding whitespace and double quotes are ignored
# Returns: 0 when the string had the expected shape; 1 on bad usage or when it
#   did not (ITEM_* are then cleared and ITEM_MONTH is "--", so stale values
#   from a previous call cannot be mistaken for a result)
#######################################
_time_parse_ddmonyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in dd mon yyyy>" >&2
    # return, not exit: this file is sourced, exit would kill the caller.
    return 1
  fi

  local text="${1//\"/}"
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"

  # The month token is "anything without whitespace" so that month names this
  # function cannot translate still yield a usable day and year.
  local shape='^([0-9]+)[[:space:]]+([^[:space:]]+)[[:space:]]+([0-9]+)'
  if [[ ! $text =~ $shape ]]; then
    ITEM_MONTH_WORD=""
    ITEM_DAY_SHORT=""
    ITEM_DAY=""
    ITEM_YEAR=""
    ITEM_MONTH="--"
    echo "Not a 'DD Month YYYY' date: $1" >&2
    return 1
  fi

  ITEM_DAY_SHORT="${BASH_REMATCH[1]}"
  ITEM_MONTH_WORD="${BASH_REMATCH[2]%[.,]}"
  ITEM_YEAR="${BASH_REMATCH[3]}"
  # Force base 10: printf would otherwise read "08" / "09" as invalid octal.
  ITEM_DAY=$(printf '%02d' "$((10#$ITEM_DAY_SHORT))")

  case "$ITEM_MONTH_WORD" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May*) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}
|
||||||
104
functions/.functions.utils
Normal file
104
functions/.functions.utils
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
# Private worker for _utils_ocrmypdf: OCR <src> page by page inside the scratch
# directory <tmp> and unite the pages into <dst>.
#   $1 - source PDF   $2 - destination PDF   $3 - existing scratch directory
# Returns 1 as soon as a mandatory step fails; the caller removes <tmp>.
_utils_ocrmypdf_pages() {
  local src="$1"
  local dst="$2"
  local tmp="$3"
  local page img rotation
  local i=0
  local -a finished=()

  pdfseparate "$src" "$tmp/page-%04d.pdf" || return 1

  for page in "$tmp"/page-*.pdf; do
    img="$tmp/img-$i.png"

    # Reset the page's rotation to 0 so the rendering below starts from a
    # known orientation.
    qpdf --replace-input --rotate=0:1-z "$page"
    pdftoppm -singlefile -r 300 -png -cropbox "$page" "$tmp/img-$i" || return 1

    # Checks rotations. Annoying way to do it but whatever.
    # (tesseract --psm 0 prints a "Rotate: <angle>" line.)
    rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
    case "$rotation" in
      180) convert "$img" -rotate 180 "$img" ;;
      90) convert "$img" -rotate 90 "$img" ;;
      270) convert "$img" -rotate 270 "$img" ;;
    esac

    ocrmypdf \
      --skip-text \
      --clean \
      --optimize 1 \
      --jobs 1 \
      "$img" "$tmp/ocr-$i-tmp.pdf" || return 1

    case "$rotation" in
      90 | 270)
        # NOTE(review): both angles are rotated by 270 here, exactly as
        # before; confirm that 270 (rather than 90) is intended for the
        # 270-degree case.
        if qpdf "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" --rotate=270:1-z; then
          rm -f "$tmp/ocr-$i-tmp.pdf"
        else
          # Rotation failed: fall back to the un-rotated OCR result.
          mv "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" || return 1
        fi
        ;;
      *)
        mv "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" || return 1
        ;;
    esac

    # Remember pages in processing order so no `ls` parsing is needed later.
    finished+=("$tmp/ocr-$i.pdf")
    i=$((i + 1))
  done

  pdfunite "${finished[@]}" "$dst" || return 1
}

#######################################
# Make sure a PDF has a text layer.
# If pdffonts reports any font in the first five pages the file is considered
# already OCRed and is copied unchanged. Otherwise every page is rendered at
# 300 dpi, turned upright according to tesseract's orientation detection,
# OCRed with ocrmypdf, and the pages are reunited.
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
# Returns: 0 on success, 1 on bad usage or when any mandatory step fails.
#   Never calls exit: this file is sourced, exit would kill the caller.
#######################################
_utils_ocrmypdf() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <in.pdf> <out.pdf>" >&2
    return 1
  fi

  local src="$1"
  local dst="$2"
  local fonts scratch rc

  # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
  fonts=$(pdffonts -l 5 "$src" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
  if [ "$fonts" != '' ] && [ "$fonts" != '[none]' ]; then
    echo "$src is already OCRed. Saving as is."
    cp "$src" "$dst" || return 1
    return 0
  fi
  echo "NOT OCRed yet. Working..."

  scratch=$(mktemp -d) || return 1
  _utils_ocrmypdf_pages "$src" "$dst" "$scratch"
  rc=$?
  # Page images at 300 dpi are large; always clean up, success or not.
  rm -rf -- "$scratch"
  return "$rc"
}
|
||||||
|
|
||||||
|
#######################################
# Normalise a string (typically a scraped title or file name) and print it on
# stdout without a trailing newline. In order, the Perl program:
#   - percent-decodes repeatedly until the string stops changing
#   - applies Unicode NFKC normalisation
#   - maps the various Unicode dash / minus characters to "-"
#   - replaces "&" with "and"
#   - maps curly single quotes to "'" and curly double quotes to '"'
#   - turns no-break spaces into spaces and removes zero-width characters
#   - collapses whitespace runs, trims both ends, and removes whitespace that
#     sits directly before a final ".ext" suffix
# Requires the Perl modules URI::Escape and Unicode::Normalize.
# Arguments:
#   $1 - input string
# Returns: perl's exit status, or 1 on bad usage
#######################################
_utils_fix_dashes() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <input string>" >&2
    # return, not exit: this file is sourced, so exit would terminate the
    # calling script whenever the function is not run in a subshell.
    return 1
  fi

  perl -CSDA -MURI::Escape -MUnicode::Normalize -e '
binmode STDOUT, ":utf8";
my $s = shift // "";
my $prev;
do { $prev = $s; $s = uri_unescape($s); } while ($s ne $prev);
$s = NFKC($s);
$s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
$s =~ s/&/and/g;
$s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
$s =~ tr/\x{201C}\x{201D}/"/;
$s =~ tr/\x{00A0}/ /;
$s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
$s =~ s/\s+/ /g;
$s =~ s/^\s+|\s+$//g;
$s =~ s/\s+(\.[^. ]+)$/$1/;
print $s;
' "$1"
}
|
||||||
|
|
||||||
|
#######################################
# Download one URL to a file with curl, skipping the transfer when the local
# copy is already up to date.
# curl is run with -z "<outfile>" (time condition taken from the existing
# file), follows redirects, skips certificate verification, and retries up to
# 3 times with a 2 second delay. The HTTP status decides the outcome:
#   200   -> "Downloaded."
#   304   -> "Already exists! Skipping."
#   other -> error on stderr, <outfile> is removed (curl has already written
#            whatever body it received into it), return 1
# Globals (written): WGET_UA is (re)assigned the browser user agent on every
#   call, as before; callers that read WGET_UA see this value afterwards.
# Arguments:
#   $1 - URL
#   $2 - output file
# Returns: 0 for 200 / 304, 1 otherwise or on bad usage
#######################################
_utils_download_helper() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <url> <outfile>" >&2
    # return, not exit: this file is sourced, exit would kill the caller.
    return 1
  fi

  WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

  local url="$1"
  local out="$2"
  local code

  # -w prints only the status code on stdout; the body goes to "$out".
  code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 -z "$out" -o "$out" "$url")

  case "$code" in
    200)
      echo "Downloaded."
      ;;
    304)
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      rm -f "$out"
      return 1
      ;;
  esac
}
|
||||||
78
template/default.html
Normal file
78
template/default.html
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
<meta charset="UTF-8">
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
width: 90%;
|
||||||
|
min-width: 600px;
|
||||||
|
position: relative;
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
color: #666;
|
||||||
|
font-size: 16px;
|
||||||
|
font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
|
||||||
|
font-weight: 300;
|
||||||
|
}
|
||||||
|
strong {
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
p {
|
||||||
|
color: #000000;
|
||||||
|
}
|
||||||
|
.h1, .h2, .h3, .h4, .h5, .h6, .post-teaser.featured .post-title, h1, h2, h3, h4, h5, h6 {
|
||||||
|
font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
|
||||||
|
font-weight: 700;
|
||||||
|
line-height: 1.1;
|
||||||
|
color: #087ac0;
|
||||||
|
}
|
||||||
|
|
||||||
|
table {
|
||||||
|
display: table;
|
||||||
|
margin-bottom: 2em;
|
||||||
|
min-width: 100%;
|
||||||
|
border-spacing: 0;
|
||||||
|
border-collapse: collapse;
|
||||||
|
border-color: #ccc;
|
||||||
|
background-color: transparent;
|
||||||
|
line-height: 1.5;
|
||||||
|
}
|
||||||
|
.table-responsive {
|
||||||
|
overflow-x: auto;
|
||||||
|
}
|
||||||
|
table tbody {
|
||||||
|
display: table-row-group;
|
||||||
|
vertical-align: top;
|
||||||
|
border-color: inherit;
|
||||||
|
}
|
||||||
|
table tbody > tr:nth-of-type(2n+1) {
|
||||||
|
background-color: #f2f2f2;
|
||||||
|
}
|
||||||
|
table tr {
|
||||||
|
display: table-row;
|
||||||
|
vertical-align: inherit;
|
||||||
|
border-color: inherit;
|
||||||
|
}
|
||||||
|
table tbody > tr > td, table tbody > tr > th, table thead > tr > th {
|
||||||
|
padding: 8px;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
vertical-align: top;
|
||||||
|
}
|
||||||
|
table td {
|
||||||
|
display: table-cell;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
}
|
||||||
|
ol, ul {
|
||||||
|
margin-top: 0;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
}
|
||||||
|
ol, ul {
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
ol li {
|
||||||
|
padding-left: 10px;
|
||||||
|
}
|
||||||
|
ol li, ul li {
|
||||||
|
padding-bottom: 12px;
|
||||||
|
}
|
||||||
|
address, dd, dt, li, p {
|
||||||
|
line-height: 1.5;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
83
template/default_getinvolved.html
Normal file
83
template/default_getinvolved.html
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
<meta charset="UTF-8">
|
||||||
|
<style>
|
||||||
|
td, h1, h2, h3, p, b, div, i, span, label, ul, li, tr, table { page-break-inside: avoid; }
|
||||||
|
body {
|
||||||
|
width: 90%;
|
||||||
|
min-width: 600px;
|
||||||
|
position: relative;
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
color: #666;
|
||||||
|
font-size: 16px;
|
||||||
|
font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
|
||||||
|
font-weight: 300;
|
||||||
|
}
|
||||||
|
strong {
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
p {
|
||||||
|
color: #000000;
|
||||||
|
}
|
||||||
|
.h1, .h2, .h3, .h4, .h5, .h6, .post-teaser.featured .post-title, h1, h2, h3, h4, h5, h6 {
|
||||||
|
font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
|
||||||
|
font-weight: 700;
|
||||||
|
line-height: 1.1;
|
||||||
|
color: #087ac0;
|
||||||
|
}
|
||||||
|
|
||||||
|
table {
|
||||||
|
display: table;
|
||||||
|
margin-bottom: 2em;
|
||||||
|
min-width: 100%;
|
||||||
|
border-spacing: 0;
|
||||||
|
border-collapse: collapse;
|
||||||
|
border-color: #ccc;
|
||||||
|
background-color: transparent;
|
||||||
|
line-height: 1.5;
|
||||||
|
}
|
||||||
|
.table-responsive {
|
||||||
|
overflow-x: auto;
|
||||||
|
}
|
||||||
|
table tbody {
|
||||||
|
display: table-row-group;
|
||||||
|
vertical-align: top;
|
||||||
|
border-color: inherit;
|
||||||
|
}
|
||||||
|
table tbody > tr:nth-of-type(2n+1) {
|
||||||
|
background-color: #f2f2f2;
|
||||||
|
}
|
||||||
|
table tr {
|
||||||
|
display: table-row;
|
||||||
|
vertical-align: inherit;
|
||||||
|
border-color: inherit;
|
||||||
|
}
|
||||||
|
table tbody > tr > td, table tbody > tr > th, table thead > tr > th {
|
||||||
|
padding: 8px;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
vertical-align: top;
|
||||||
|
}
|
||||||
|
table td {
|
||||||
|
display: table-cell;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
}
|
||||||
|
ol, ul {
|
||||||
|
margin-top: 0;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
}
|
||||||
|
ol, ul {
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
ol li {
|
||||||
|
padding-left: 10px;
|
||||||
|
}
|
||||||
|
ol li, ul li {
|
||||||
|
padding-bottom: 12px;
|
||||||
|
}
|
||||||
|
address, dd, dt, li, p {
|
||||||
|
line-height: 1.5;
|
||||||
|
}
|
||||||
|
img {
|
||||||
|
max-width: 100% !important;
|
||||||
|
height: auto !important;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
35
websites.csv
Normal file → Executable file
35
websites.csv
Normal file → Executable file
@ -1,34 +1,3 @@
|
|||||||
"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", ""
|
"https://pub-london.escribemeetings.com/", "LondonArchive", ""
|
||||||
"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", ""
|
"https://pub-stthomas.escribemeetings.com/", "StThomasArchive", ""
|
||||||
"https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
|
|
||||||
"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
|
|
||||||
"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
|
|
||||||
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
|
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
|
||||||
"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
|
|
||||||
"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
|
|
||||||
"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
|
|
||||||
"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
|
|
||||||
"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
|
|
||||||
"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
|
|
||||||
"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
|
|
||||||
"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
|
|
||||||
"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
|
|
||||||
"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
|
|
||||||
"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
|
|
||||||
"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
|
|
||||||
"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
|
|
||||||
"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
|
|
||||||
"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
|
|
||||||
"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
|
|
||||||
"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
|
|
||||||
"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
|
|
||||||
"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
|
|
||||||
"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
|
|
||||||
"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
|
|
||||||
"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
|
|
||||||
"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
|
|
||||||
"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
|
|
||||||
"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
|
|
||||||
"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
|
|
||||||
"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
|
|
||||||
"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""
|
|
||||||
|
|||||||
|
Loading…
Reference in New Issue
Block a user