#!/usr/bin/env bash
#
# scrape_planapps.sh - archive planning applications published on london.ca.
#
# Walks the paginated search listing, opens every application whose
# "Last modified" date is recent enough, and stores its details, photos and
# attachments under
#   ./LondonArchive/Planning Applications/<type>/<file number> - <name>/
#
# Needs wget, grep, sed, cut, tr and uniq. Every path is relative to the
# current directory. _time_parse_helper is not defined in this file; it is
# presumably provided by ./functions/.functions (sourced below) and is
# expected to fill ITEM_YEAR / ITEM_MONTH / ITEM_DAY from a date string.

RULE="-========================================================================-"

echo -e "\n$RULE"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_PLANAPPS.SH: Downloads planning applications -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "$RULE"

source ./functions/.functions
if ! command -v _time_parse_helper > /dev/null; then
  # Without the helper every date check fails and nothing gets archived.
  echo "warning: _time_parse_helper is not defined; dates cannot be parsed" >&2
fi

# Warning to all who read this script:
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.

# London seems to have recently blocked unusual user agents, so plain wget
# (or even ping) is refused. Pretend to be a regular browser instead.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

PLANAPP_SITE="https://london.ca"
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"

TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_pa.html"
PROJECT_PAGE="./tmp/work_pa.html"
PROJECT_INFO="./tmp/info.txt"
PROJECT_ATTACH_NAMES="./tmp/names.txt"
PROJECT_ATTACH_URLS="./tmp/urls.txt"
PROJECT_IMAGE_NAMES="./tmp/image-names.txt"
PROJECT_IMAGE_URLS="./tmp/image-urls.txt"

current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

#######################################
# Store in the variable named $1 the text $2 with every run of blanks
# collapsed to one space and both ends trimmed. This is what an unquoted
# `echo $var` prints, but without pathname expansion, so `?`, `*` and `[`
# inside page text can never be replaced by local file names.
#######################################
_planapp_squash() {
  local -a _pa_words=()
  local IFS=$' \t\n'
  read -r -a _pa_words <<< "$2"
  printf -v "$1" '%s' "${_pa_words[*]}"
}

#######################################
# Filter: turn the HTML entities for "&" and "'" back into plain characters.
# "&amp;" is decoded first, so a doubly escaped apostrophe is decoded too.
#######################################
_planapp_decode_entities() {
  sed -e 's/&amp;/\&/g' \
    -e "s/&#039;/'/g" \
    -e "s/&#39;/'/g" \
    -e "s/&apos;/'/g"
}

#######################################
# Download URL $1 into file $2, quietly, with the browser user agent.
# (--timestamping is deliberately absent: wget turns it off whenever -O is
# given, so it never had any effect here.)
# Returns: wget's exit status (8 = the server answered with an error).
#######################################
_planapp_fetch() {
  wget --user-agent="$WGET_UA" "$1" -O "$2" -q #--show-progress
}

#######################################
# Print the application type for the file-number prefix $1 (the part before
# the first "-"), or "BAD RECORD TYPE" when the prefix is not recognised.
#######################################
_planapp_file_type() {
  case "$1" in
    "Line of Sight") echo "Line of Sight" ;;
    O) echo "Official Plan Amendment" ;;
    Z) echo "Zoning By-law Amendment" ;;
    OZ) echo "Official Plan and Zoning By-law Amendment" ;;
    TZ) echo "Temporary Zoning By-law Amendment" ;;
    39T) echo "Draft Plan of Subdivision" ;;
    39CD) echo "Draft Plan of Condominium" ;;
    M) echo "Minor Zoning By-law Amendment" ;;
    H) echo "Holding Provision By-law Amendment" ;;
    *)
      if [[ "$1" =~ ^SPA2[0-9]+$ ]]; then
        echo "Site Plan Control Application"
      else
        echo "BAD RECORD TYPE"
      fi
      ;;
  esac
}

#######################################
# Succeed when year $1 / month $2 is no older than the month before
# year $3 / month $4. Months are counted continuously, so December still
# counts as "last month" in January. Non-numeric input is never recent.
#######################################
_planapp_is_recent() {
  [[ "$1" =~ ^[0-9]+$ && "$2" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so "08" and "09" are not read as invalid octal.
  ((10#$1 * 12 + 10#$2 >= 10#$3 * 12 + 10#$4 - 1))
}

#######################################
# Split the "File Number" field value $1 into PROJECT_FILE_NUM /
# PROJECT_FILE_TYPE and, when two numbers are listed, PROJECT_FILE_NUM_2 /
# PROJECT_FILE_TYPE_2.
#
# Multiple file numbers may be listed. The first one is always the primary
# one as it is the one contained in PDF names. Formats seen so far:
#   XX-#####
#   XX-#####/XX-#####
#   XX-##### / XX-#####
#   XX-##### and XX-#####
#   XX-##### & XX-#####
# I think the city is allergic to standardization...
#######################################
_planapp_parse_file_numbers() {
  local _pa_item
  _planapp_squash _pa_item "$1"

  PROJECT_FILE_NUM_2=""
  PROJECT_FILE_NUM_TYPE_2=""
  PROJECT_FILE_NUM_IS_MULTI=""
  if [[ "$_pa_item" == *and* || "$_pa_item" == */* || "$_pa_item" == *"&"* ]]; then
    PROJECT_FILE_NUM_IS_MULTI=$_pa_item
  fi

  PROJECT_FILE_NUM=$(printf '%s\n' "$_pa_item" \
    | sed -e 's|/.*||' -e 's| and .*||' -e 's| & .*||' \
      -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
  PROJECT_FILE_NUM_TYPE=${PROJECT_FILE_NUM%%-*}
  PROJECT_FILE_TYPE=$(_planapp_file_type "$PROJECT_FILE_NUM_TYPE")
  echo "Found file# : $PROJECT_FILE_NUM ($PROJECT_FILE_TYPE)"

  if [[ -n "$PROJECT_FILE_NUM_IS_MULTI" ]]; then
    # It isn't great, but if a project has 2 file numbers then it is saved
    # as both, since there is no way to tag files.
    PROJECT_FILE_NUM_2=$(printf '%s\n' "$_pa_item" \
      | sed -e 's|.*/||' -e 's|.* and ||' -e 's|.* & ||' \
        -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
    PROJECT_FILE_NUM_TYPE_2=${PROJECT_FILE_NUM_2%%-*}
    PROJECT_FILE_TYPE_2=$(_planapp_file_type "$PROJECT_FILE_NUM_TYPE_2")
    echo "Also filed as: $PROJECT_FILE_NUM_2 ($PROJECT_FILE_TYPE_2)"
  fi
}

#######################################
# Handle the line that follows a field label: $1 is that (squashed) line.
# Appends "<label>: <value>" to PROJECT_INFO and, depending on the label and
# the line, parses the file numbers and the date it carries.
#######################################
_planapp_handle_field_item() {
  local _pa_line=$1

  # A wrapper line or script residue is not an actual item.
  [[ "$_pa_line" == *field__items* ]] && return 0
  [[ "$_pa_line" == *"</script>"* ]] && return 0

  PROJECT_INFO_ITEM=$(printf '%s\n' "$_pa_line" \
    | sed 's/.*<div class="field__item">\(<time[^>]*>\)\?\([^<]*\).*/\2/' \
    | _planapp_decode_entities)
  [[ -n "$PROJECT_INFO_ITEM" ]] || return 0

  # Sometimes attachments get caught as info items; their label carries
  # "visually-hidden", which gives them away.
  [[ "$PROJECT_INFO_LABEL" == *visually-hidden* ]] && return 0

  printf "%-17s: %s\n" "$PROJECT_INFO_LABEL" "$PROJECT_INFO_ITEM" >> "$PROJECT_INFO"

  if [[ "$PROJECT_INFO_LABEL" == "File Number" ]]; then
    _planapp_parse_file_numbers "$PROJECT_INFO_ITEM"
  fi

  if [[ "$_pa_line" == *datetime* ]]; then
    # Keep what follows the first ", " and then drop the first word.
    _time_parse_helper "$(printf '%s\n' "$_pa_line" \
      | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g' \
      | cut -d, -f2- | cut -d' ' -f2-)"
    echo "Found date : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
  fi
  return 0
}

#######################################
# Read PROJECT_PAGE line by line, collecting info fields into PROJECT_INFO
# and attachment / image URLs and names into the PROJECT_*_URLS and
# PROJECT_*_NAMES list files.
#
# Info boxes have the label on one line and the contents on the next (except
# for contact info), so NEXT_LINE_FITEM / NEXT_LINE_IMAGE flag that the
# upcoming line is contents.
#######################################
_planapp_scan_project_page() {
  local _pa_raw _pa_line

  # Start clean so a label on the last line of the previous page cannot leak.
  NEXT_LINE_FITEM=0
  NEXT_LINE_IMAGE=0

  while IFS= read -r _pa_raw; do
    _planapp_squash _pa_line "$_pa_raw"

    if ((NEXT_LINE_FITEM)); then
      NEXT_LINE_FITEM=0
      _planapp_handle_field_item "$_pa_line"
    fi

    if ((NEXT_LINE_IMAGE)); then
      NEXT_LINE_IMAGE=0
      PROJECT_IMAGE_URL=$(printf '%s\n' "$_pa_line" \
        | sed -e 's/.*<img[^>]*src="\([^"]*\)".*/\1/' -e 's/?.*//')
      if [[ "$_pa_line" != *"https://london.ca"* ]]; then
        PROJECT_IMAGE_URL="${PLANAPP_SITE}${PROJECT_IMAGE_URL}"
      fi
      _planapp_squash PROJECT_IMAGE_URL "$PROJECT_IMAGE_URL"
      PROJECT_IMAGE_NAME=${PROJECT_IMAGE_URL##*/}
      printf '%s\n' "$PROJECT_IMAGE_URL" >> "$PROJECT_IMAGE_URLS"
      printf '%s\n' "$PROJECT_IMAGE_NAME" >> "$PROJECT_IMAGE_NAMES"
    fi

    if [[ "$_pa_line" == *file--mime-application-* ]]; then
      PROJECT_ATTACH_URL=$(printf '%s\n' "$_pa_line" \
        | sed 's/.*href="\([^"]*\)".*/\1/')
      # Newer links are relative paths, so the domain must be added back.
      if [[ "$_pa_line" != *"https://london.ca"* ]]; then
        PROJECT_ATTACH_URL="${PLANAPP_SITE}${PROJECT_ATTACH_URL}"
      fi
      _planapp_squash PROJECT_ATTACH_URL "$PROJECT_ATTACH_URL"
      PROJECT_ATTACH_NAME=$(printf '%s\n' "$_pa_line" \
        | sed 's/.*title="\([^"]*\)".*/\1/' \
        | _planapp_decode_entities)
      _planapp_squash PROJECT_ATTACH_NAME "$PROJECT_ATTACH_NAME"
      # The name becomes a file name, so it must not contain a "/".
      PROJECT_ATTACH_NAME=${PROJECT_ATTACH_NAME//\//-}
      printf '%s\n' "$PROJECT_ATTACH_URL" >> "$PROJECT_ATTACH_URLS"
      printf '%s\n' "$PROJECT_ATTACH_NAME" >> "$PROJECT_ATTACH_NAMES"
    fi

    if [[ "$_pa_line" == *field__label* ]]; then
      PROJECT_INFO_LABEL=$(printf '%s\n' "$_pa_line" \
        | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/')
      NEXT_LINE_FITEM=1
    fi

    # Same idea as the label flag, but for the image shown on the main page.
    if [[ "$_pa_line" == *"field__label visually-hidden"* && "$_pa_line" == *Image* ]]; then
      NEXT_LINE_IMAGE=1
    fi

    PROJECT_FOUND_EMAIL=""
    if [[ "$_pa_line" == *field--name-field-email* ]]; then
      PROJECT_FOUND_EMAIL=$(printf '%s\n' "$_pa_line" \
        | sed -e 's/.*href="\([^"]*\)".*/\1/' -e 's|^mailto:||')
    fi
    if [[ -n "$PROJECT_FOUND_EMAIL" ]]; then
      printf "%-17s: %s\n" "Email" "$PROJECT_FOUND_EMAIL" >> "$PROJECT_INFO"
    fi

    PROJECT_FOUND_PLANNER=""
    if [[ "$_pa_line" == *field--name-name* ]]; then
      PROJECT_FOUND_PLANNER=$(printf '%s\n' "$_pa_line" \
        | sed 's/.*<div[^>]*>\([^<]*\)<[\/:-]div>.*/\1/')
    fi
    if [[ -n "$PROJECT_FOUND_PLANNER" ]]; then
      printf "\n%-17s: %s\n" "Planner" "$PROJECT_FOUND_PLANNER" >> "$PROJECT_INFO"
    fi
  done < "$PROJECT_PAGE"
}

#######################################
# Work out and create the archive folder(s) for the current project.
# Sets PROJECT_SAVE_PATH, and PROJECT_SAVE_PATH_2 when the project carries a
# second file number of a different type (empty otherwise). Two numbers of
# the same type share a single "<num> & <num 2> - <name>" folder.
#######################################
_planapp_prepare_save_paths() {
  local _pa_root="./LondonArchive/Planning Applications"

  PROJECT_SAVE_PATH_2=""
  mkdir -p "$_pa_root/$PROJECT_FILE_TYPE"

  if [[ -n "$PROJECT_FILE_NUM_2" && "$PROJECT_FILE_TYPE" == "$PROJECT_FILE_TYPE_2" ]]; then
    PROJECT_SAVE_PATH="$_pa_root/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME"
  else
    PROJECT_SAVE_PATH="$_pa_root/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME"
    if [[ -n "$PROJECT_FILE_NUM_2" ]]; then
      PROJECT_SAVE_PATH_2="$_pa_root/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME"
      mkdir -p "$PROJECT_SAVE_PATH_2/Attachments"
    fi
  fi
  mkdir -p "$PROJECT_SAVE_PATH/Attachments"
}

#######################################
# Download every URL listed in file $1 under the name on the matching line
# of file $2, into sub-folder $3 (may be empty) of the project folder(s).
#######################################
_planapp_save_listed_files() {
  local _pa_url _pa_name

  while IFS= read -r _pa_url && IFS= read -r _pa_name <&3; do
    #echo " - $_pa_url"
    echo " - $_pa_name"
    _planapp_fetch "$_pa_url" "$PROJECT_SAVE_PATH$3/$_pa_name"
    if [[ -n "$PROJECT_SAVE_PATH_2" ]]; then
      _planapp_fetch "$_pa_url" "$PROJECT_SAVE_PATH_2$3/$_pa_name"
    fi
  done < "$1" 3< "$2"
}

#######################################
# Download and archive the application linked from search-result line $1.
# Returns: 1 when the page was last modified too long ago, 0 otherwise.
#######################################
_planapp_process_project() {
  local _pa_teaser=$1

  echo "$RULE"
  echo "Task starting on: $(date)"

  PROJECT_URL=$(printf '%s\n' "$_pa_teaser" | sed 's/.*href="\([^"]*\)".*/\1/')
  PROJECT_URL="${PLANAPP_SITE}${PROJECT_URL}"

  echo "Downloading page..."
  _planapp_fetch "$PROJECT_URL" "$PROJECT_PAGE"

  # Removing COVID is due to the naming in the 2020s. Keeping it for
  # revisiting wayback crawls.
  PROJECT_NAME=$(grep "page-title" "$PROJECT_PAGE" \
    | grep "field--name-title" \
    | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/' \
    | _planapp_decode_entities \
    | sed 's/^COVID-19//' \
    | uniq | tr -d '\r\n' | tr '/' '-')
  echo " Found project: $PROJECT_NAME"

  ITEM_YEAR=""
  ITEM_MONTH=""
  ITEM_DAY=""
  _time_parse_helper "$(grep "Last modified:" "$PROJECT_PAGE" \
    | sed -e 's/.*<\/span>//' -e 's/<\/div>.*//' -e 's/^[^, ]*, //' \
    | grep -E '[0-9]{4}')"
  if ! _planapp_is_recent "$ITEM_YEAR" "$ITEM_MONTH" "$current_year" "$current_month"; then
    echo "Dates are in the past! Abort."
    return 1
  fi
  echo "Last Modified: $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"

  echo "Finding attachments..."

  # Truncate rather than delete: the list files are read back below even
  # when the page turns out to have no attachments, photos or info fields.
  : > "$PROJECT_INFO"
  : > "$PROJECT_ATTACH_URLS"
  : > "$PROJECT_ATTACH_NAMES"
  : > "$PROJECT_IMAGE_URLS"
  : > "$PROJECT_IMAGE_NAMES"

  # Forget the previous project, so a page without a "File Number" field is
  # not filed under somebody else's number.
  PROJECT_FILE_NUM=""
  PROJECT_FILE_NUM_2=""
  PROJECT_FILE_TYPE="BAD RECORD TYPE"
  PROJECT_FILE_TYPE_2=""
  PROJECT_INFO_LABEL=""
  PROJECT_IMAGE_URL=""

  _planapp_scan_project_page

  echo "Filing away all the datas..."
  _planapp_prepare_save_paths

  echo "Saving attachments:"
  _planapp_save_listed_files "$PROJECT_ATTACH_URLS" "$PROJECT_ATTACH_NAMES" "/Attachments"
  echo "All attachments saved."

  if [[ -n "$PROJECT_IMAGE_URL" ]]; then
    PROJECT_IMAGE_URL=""
    echo "Saving photos:"
    _planapp_save_listed_files "$PROJECT_IMAGE_URLS" "$PROJECT_IMAGE_NAMES" ""
    echo "All photos saved."
  fi

  echo "Extracted info summary:"
  cat "$PROJECT_INFO" > "$PROJECT_SAVE_PATH/Info.txt"
  if [[ -n "$PROJECT_SAVE_PATH_2" ]]; then
    cat "$PROJECT_INFO" > "$PROJECT_SAVE_PATH_2/Info.txt"
  fi
  cat "$PROJECT_INFO"
  return 0
}

#######################################
# Page through the search listing until a page fails with a server error or
# lists no applications, archiving every recent application on the way.
#######################################
main() {
  local _pa_raw _pa_line _pa_status

  rm -f "$SEARCH_PAGE" "$PROJECT_PAGE"
  mkdir -p "$TEMP_DIR"

  j=0
  SEARCH_END=0
  while ((!SEARCH_END)); do
    echo "$RULE"
    echo "Downloading search results... Page $j"
    _planapp_fetch "${SEARCH_URL}?page=${j}" "$SEARCH_PAGE"
    _pa_status=$?

    # wget exits with 8 when the server answers with an error response.
    if ((_pa_status != 8)) && grep -q "teaser__title" "$SEARCH_PAGE"; then
      while IFS= read -r _pa_raw; do
        _planapp_squash _pa_line "$_pa_raw"
        [[ "$_pa_line" == *teaser__title* ]] || continue
        # NOTE(review): an old application only skips the rest of THIS
        # results page; the next page is still fetched. Confirm whether
        # "Abort" was meant to end the whole run.
        _planapp_process_project "$_pa_line" || break
      done < "$SEARCH_PAGE"
    else
      SEARCH_END=1
      echo "No more pages!"
    fi
    ((j++))
  done
}

main "$@"