#!/usr/bin/env bash
#
# SCRAPE_PLANAPPS.SH: Downloads planning applications
#
# Walks the paginated planning-applications listing on london.ca, opens every
# application page it links to, and files what it finds under
#   ./LondonArchive/Planning Applications/<type>/<file#> - <name>/
#     Info.txt       label/value pairs scraped from the page
#     <image>        the image shown on the page, if any
#     Attachments/   every linked "file--mime-application-*" document
#
# Provenance: recovered from git commit 9cf9db03e079aad637fef38642603abb4e9d62ab
# (Lillian Skinner, Tue, 17 Jun 2025 17:20:11 -0400).
#
# NOTE(review): the copy this was recovered from had passed through an HTML
# stripper that removed tag-like text and decoded entities inside the sed/grep
# patterns. Every reconstructed pattern below is marked NOTE(review); confirm
# them against the upstream gist.
#
# Requires: bash, GNU wget, GNU sed (the patterns use the `\?` extension), grep.

# Unset variables are bugs here (stale state used to leak between projects).
# Deliberately no `set -e`: the scrape is best-effort and one bad page or
# attachment must not abort the whole run.
set -u

readonly RULE="-========================================================================-"

# London seems to have recently blocked unusual user agents. Can't use wget's
# default UA or even ping. Thankfully we can pretend to be a real person!
readonly WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

readonly BASE_URL="https://london.ca"
readonly SEARCH_URL="${BASE_URL}/business-development/planning-development-applications/planning-applications"
readonly ARCHIVE_ROOT="./LondonArchive/Planning Applications"

readonly TEMP_DIR="./tmp/"
readonly SEARCH_PAGE="./tmp/index_pa.html"
readonly PROJECT_PAGE="./tmp/work_pa.html"
readonly PROJECT_INFO="./tmp/info.txt"
readonly PROJECT_ATTACH_NAMES="./tmp/names.txt"
readonly PROJECT_ATTACH_URLS="./tmp/urls.txt"
readonly PROJECT_IMAGE_NAMES="./tmp/image-names.txt"
readonly PROJECT_IMAGE_URLS="./tmp/image-urls.txt"

# State of the project currently being processed (reset for every project).
PROJECT_NAME=""
PROJECT_FILE_NUM=""
PROJECT_FILE_TYPE=""
PROJECT_FILE_NUM_2=""
PROJECT_FILE_TYPE_2=""

# Print a progress line with the job prefix.
log() {
  printf 'SCRAPE_PLANAPPS: %s\n' "$*"
}

# Print a message to stderr and abort.
die() {
  printf 'SCRAPE_PLANAPPS: %s\n' "$*" >&2
  exit 1
}

# Print the start-up banner. Padding is computed from the rule width so the
# right-hand border always lines up.
banner() {
  local inner=$((${#RULE} - 8))
  local text
  printf '\n%s\n' "$RULE"
  for text in \
    "" \
    "SCRAPE_PLANAPPS.SH: Downloads planning applications" \
    "" \
    "https://gist.github.com/rvtr/********************************" \
    "Lillian Skinner" \
    ""; do
    printf -- '-=- %-*s -=-\n' "$inner" "$text"
  done
  printf '%s\n' "$RULE"
}

# fetch URL DEST: download URL to DEST with the browser user agent.
# Returns wget's exit status (8 = the server answered with an error).
# The original also passed --timestamping, which wget ignores (with a warning)
# whenever -O is given, so it is dropped here.
fetch() {
  wget --user-agent="$WGET_UA" -q -O "$2" "$1"
}

# absolute_url URL: print URL unchanged if it already has an http(s) scheme,
# otherwise prefix the site root. Newer links on the site are relative paths.
absolute_url() {
  case "$1" in
    http://* | https://*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "${BASE_URL}$1" ;;
  esac
}

# Filter: turn the HTML entities for & and ' back into the characters.
# NOTE(review): the entity names were decoded away in the recovered copy
# (leaving a no-op and an unterminated sed command); &amp; and &#039; are
# reconstructed from the original comment "Gotta add in the &s and 's".
decode_entities() {
  sed -e 's/&amp;/\&/g' -e "s/&#039;/'/g"
}

# trim STRING: print STRING without leading/trailing whitespace.
trim() {
  local s=$1
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

# file_type_for PREFIX: print the archive folder name for a file-number prefix
# (the part before the first "-"), or "BAD RECORD TYPE" if it is not recognised.
file_type_for() {
  case "$1" in
    "Line of Sight") printf '%s\n' "Line of Sight" ;;
    O) printf '%s\n' "Official Plan Amendment" ;;
    Z) printf '%s\n' "Zoning By-law Amendment" ;;
    OZ) printf '%s\n' "Official Plan and Zoning By-law Amendment" ;;
    TZ) printf '%s\n' "Temporary Zoning By-law Amendment" ;;
    39T) printf '%s\n' "Draft Plan of Subdivision" ;;
    39CD) printf '%s\n' "Draft Plan of Condominium" ;;
    *)
      if [[ "$1" =~ ^SPA2[0-9]+$ ]]; then
        printf '%s\n' "Site Plan Control Application"
      else
        printf '%s\n' "BAD RECORD TYPE"
      fi
      ;;
  esac
}

# parse_file_numbers ITEM: split a "File Number" value into the primary and
# (optional) secondary file number and set PROJECT_FILE_NUM[_2] and
# PROJECT_FILE_TYPE[_2].
#
# Multiple file numbers may be listed. We always use the first one as the
# primary because it is the one contained in PDF names. Possible formats:
#   XX-#####
#   XX-#####/XX-#####
#   XX-##### / XX-#####
#   XX-##### and XX-#####
#   XX-##### & XX-#####
# I think the city is allergic to standardization...
parse_file_numbers() {
  local item=$1
  local first second

  first=${item%%/*}
  first=${first%% and *}
  first=${first%% & *}
  PROJECT_FILE_NUM=$(trim "$first")
  PROJECT_FILE_TYPE=$(file_type_for "${PROJECT_FILE_NUM%%-*}")

  PROJECT_FILE_NUM_2=""
  PROJECT_FILE_TYPE_2=""
  # Only the separators we can actually split on count as "multiple"; a bare
  # "and" inside a word no longer triggers a bogus second number.
  case "$item" in
    */* | *" and "* | *" & "*)
      second=${item##*/}
      second=${second##* and }
      second=${second##* & }
      PROJECT_FILE_NUM_2=$(trim "$second")
      PROJECT_FILE_TYPE_2=$(file_type_for "${PROJECT_FILE_NUM_2%%-*}")
      ;;
  esac
}

# report_date LINE: log the date found on a "datetime" line as YYYY/MM/DD.
# Parts that cannot be parsed are shown as dashes.
report_date() {
  local line=$1
  local year month_word day month

  year=$(sed 's/.*\([0-9]\{4\}\).*/\1/' <<<"$line")
  month_word=$(sed 's/.*,[[:space:]]*\([A-Za-z]*\)[[:space:]][0-9]\{1,2\},.*/\1/' <<<"$line")
  day=$(sed 's/.*,[[:space:]]*[A-Za-z]*[[:space:]]\([0-9]\{1,2\}\),.*/\1/' <<<"$line")

  [[ "$year" =~ ^[0-9]{4}$ ]] || year="----"
  if [[ "$day" =~ ^[0-9]{1,2}$ ]]; then
    # Force base 10: "08" and "09" are invalid octal for printf %d.
    printf -v day '%02d' "$((10#$day))"
  else
    day="--"
  fi
  case "$month_word" in
    January) month="01" ;;
    February) month="02" ;;
    March) month="03" ;;
    April) month="04" ;;
    May) month="05" ;;
    June) month="06" ;;
    July) month="07" ;;
    August) month="08" ;;
    September) month="09" ;;
    October) month="10" ;;
    November) month="11" ;;
    December) month="12" ;;
    *) month="--" ;;
  esac
  log "Found date : $year/$month/$day"
}

# save_listed_files URLS_FILE NAMES_FILE SUBDIR DEST...:
# download the Nth URL of URLS_FILE as the Nth name of NAMES_FILE into
# DEST/SUBDIR for every DEST. SUBDIR is "" or ends with "/".
save_listed_files() {
  local urls_file=$1 names_file=$2 subdir=$3
  shift 3
  local url name dest

  while IFS= read -r url && IFS= read -r name <&3; do
    log "- $name"
    for dest in "$@"; do
      fetch "$url" "${dest}/${subdir}${name}" \
        || printf 'SCRAPE_PLANAPPS: warning: could not download %s\n' "$url" >&2
    done
  done <"$urls_file" 3<"$names_file"
}

# process_project URL: download one application page, scrape it and file the
# results. Returns 1 if the page cannot be downloaded.
process_project() {
  local url=$1
  local -a words targets
  local line item label="" label_hidden=0 expect_item=0 expect_image=0
  local image_url attach_url attach_name email planner
  local save_path save_path_2=""

  log "$RULE"
  log "Task starting on: $(date)"

  # Start from a clean slate so nothing leaks in from the previous project.
  # A page without a "File Number" field is filed under these defaults.
  PROJECT_NAME=""
  PROJECT_FILE_NUM="NO FILE NUMBER"
  PROJECT_FILE_TYPE="BAD RECORD TYPE"
  PROJECT_FILE_NUM_2=""
  PROJECT_FILE_TYPE_2=""
  : >"$PROJECT_INFO"
  : >"$PROJECT_ATTACH_URLS"
  : >"$PROJECT_ATTACH_NAMES"
  : >"$PROJECT_IMAGE_URLS"
  : >"$PROJECT_IMAGE_NAMES"

  log "Downloading page..."
  if ! fetch "$url" "$PROJECT_PAGE"; then
    printf 'SCRAPE_PLANAPPS: warning: could not download %s, skipping\n' "$url" >&2
    return 1
  fi

  # Removing COVID is due to the naming in the 2020s. Keeping it for
  # revisiting wayback crawls. "/" becomes "-" so the name is a valid folder.
  # NOTE(review): "<span[^>" was stripped from the recovered pattern.
  PROJECT_NAME=$(
    grep 'field--name-title' "$PROJECT_PAGE" \
      | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/' \
      | decode_entities \
      | sed 's/^COVID-19//' \
      | uniq \
      | tr -d '\r\n' \
      | tr '/' '-'
  )
  log "Found project: $PROJECT_NAME"

  log "Finding attachments..."

  # Read word-split, like the original unquoted `echo $PLINE` did (outer
  # whitespace dropped, inner runs collapsed) but without glob expansion.
  while read -r -a words || [[ -n "${words[*]-}" ]]; do
    line="${words[*]-}"

    # --- value line that follows a field label -----------------------------
    if ((expect_item)); then
      expect_item=0
      # Skip wrapper lines, bad data (usually scripts), and attachments that
      # got caught as fields (their label is "visually-hidden").
      # NOTE(review): the bad-data pattern was stripped to "" in the recovered
      # copy; "<script"/"</script" is reconstructed from the comment.
      if [[ "$line" != *field__items* && "$line" != *"<script"* \
        && "$line" != *"</script"* ]] && ((!label_hidden)); then
        # NOTE(review): the opening tags were stripped from this pattern;
        # "<div ...>" plus one optional inline tag is a reconstruction. A line
        # that does not match passes through unchanged, as in the original.
        item=$(sed 's/.*<div[^>]*>\(<[A-Za-z][^>]*>\)\?\([^<]*\).*/\2/' <<<"$line" \
          | decode_entities)
        if [[ -n "$item" ]]; then
          printf '%-17s: %s\n' "$label" "$item" >>"$PROJECT_INFO"
          if [[ "$label" == "File Number" ]]; then
            parse_file_numbers "$item"
            log "Found file# : $PROJECT_FILE_NUM ($PROJECT_FILE_TYPE)"
            if [[ -n "$PROJECT_FILE_NUM_2" ]]; then
              # It isn't great, but if a project has 2 file numbers then we'll
              # save it as both, since there is no way to tag files.
              log "Also filed as: $PROJECT_FILE_NUM_2 ($PROJECT_FILE_TYPE_2)"
            fi
          fi
          if [[ "$line" == *datetime* ]]; then
            report_date "$line"
          fi
        fi
      fi
    fi

    # --- image line that follows the hidden "Image" label -------------------
    if ((expect_image)); then
      expect_image=0
      if [[ "$line" == *'src="'* ]]; then
        # NOTE(review): "<img[^>" was stripped from the recovered pattern.
        image_url=$(sed -e 's/.*<img[^>]*src="\([^"]*\)".*/\1/' -e 's/?.*//' <<<"$line")
        image_url=$(absolute_url "$image_url")
        printf '%s\n' "$image_url" >>"$PROJECT_IMAGE_URLS"
        printf '%s\n' "${image_url##*/}" >>"$PROJECT_IMAGE_NAMES"
      fi
    fi

    # --- attachment link -----------------------------------------------------
    if [[ "$line" == *file--mime-application-* ]]; then
      attach_url=$(sed 's/.*href="\([^"]*\)".*/\1/' <<<"$line")
      attach_url=$(absolute_url "$attach_url")
      # "/" in a title would be taken as a directory separator when saving.
      attach_name=$(sed 's/.*title="\([^"]*\)".*/\1/' <<<"$line" \
        | decode_entities | tr '/' '-')
      printf '%s\n' "$attach_url" >>"$PROJECT_ATTACH_URLS"
      printf '%s\n' "$attach_name" >>"$PROJECT_ATTACH_NAMES"
    fi

    # --- field label: its contents are on the next line ---------------------
    # Info boxes always have a label on one line, then the contents on the
    # next (except for contact info lol), so flag the upcoming line.
    if [[ "$line" == *field__label* ]]; then
      label=$(sed 's/.*<div[^>]*>\(<[A-Za-z][^>]*>\)\?\([^<]*\).*/\2/' <<<"$line")
      label_hidden=0
      if [[ "$line" == *visually-hidden* ]]; then
        label_hidden=1
      fi
      expect_item=1
    fi

    # Same idea as above but for the image shown on the main page.
    if [[ "$line" == *"field__label visually-hidden"* && "$line" == *Image* ]]; then
      expect_image=1
    fi

    # --- contact info (label and value share a line) ------------------------
    if [[ "$line" == *field--name-field-email* ]]; then
      email=$(sed -e 's/.*href="\([^"]*\)".*/\1/' -e 's|^mailto:||' <<<"$line")
      if [[ -n "$email" ]]; then
        printf '%-17s: %s\n' "Email" "$email" >>"$PROJECT_INFO"
      fi
    fi
    if [[ "$line" == *field--name-name* ]]; then
      # NOTE(review): "<div[^>" was stripped from the recovered pattern.
      planner=$(sed 's/.*<div[^>]*>\([^<]*\)<[\/:-]div>.*/\1/' <<<"$line")
      if [[ -n "$planner" ]]; then
        printf '\n%-17s: %s\n' "Planner" "$planner" >>"$PROJECT_INFO"
      fi
    fi
  done <"$PROJECT_PAGE"

  log "Filing away all the datas..."

  # Two file numbers of the same type share one folder; two of different
  # types get a full copy under each type.
  if [[ -n "$PROJECT_FILE_NUM_2" && "$PROJECT_FILE_TYPE" == "$PROJECT_FILE_TYPE_2" ]]; then
    save_path="$ARCHIVE_ROOT/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME"
  else
    save_path="$ARCHIVE_ROOT/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME"
  fi
  targets=("$save_path")
  if [[ -n "$PROJECT_FILE_NUM_2" && "$PROJECT_FILE_TYPE" != "$PROJECT_FILE_TYPE_2" ]]; then
    save_path_2="$ARCHIVE_ROOT/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME"
    targets+=("$save_path_2")
  fi
  for item in "${targets[@]}"; do
    mkdir -p -- "$item/Attachments" || return 1
  done

  log "Saving attachments:"
  save_listed_files "$PROJECT_ATTACH_URLS" "$PROJECT_ATTACH_NAMES" \
    "Attachments/" "${targets[@]}"
  log "All attachments saved."

  if [[ -s "$PROJECT_IMAGE_URLS" ]]; then
    log "Saving photos:"
    save_listed_files "$PROJECT_IMAGE_URLS" "$PROJECT_IMAGE_NAMES" \
      "" "${targets[@]}"
    log "All photos saved."
  fi

  log "Extracted info summary:"
  for item in "${targets[@]}"; do
    cat "$PROJECT_INFO" >"$item/Info.txt"
  done
  cat "$PROJECT_INFO"
}

# Walk the search result pages until one comes back empty or as a server error.
main() {
  local page=0 status href

  banner
  echo "Starting job: SCRAPE_PLANAPPS: $(date)"

  # Warning to all who read this script:
  # It started out sloppy because sometimes sloppy just works. Be gentle.
  command -v wget >/dev/null || die "wget is required but was not found"

  rm -f -- "$SEARCH_PAGE" "$PROJECT_PAGE"
  mkdir -p -- "$TEMP_DIR" || die "cannot create $TEMP_DIR"

  while :; do
    log "$RULE"
    log "Downloading search results... Page $page"
    status=0
    fetch "${SEARCH_URL}?page=${page}" "$SEARCH_PAGE" || status=$?

    # wget exits 8 when the server answers with an error; any other failure
    # leaves an empty page, which has no teasers either.
    if ((status == 8)) || ! grep -q 'teaser__title' "$SEARCH_PAGE" 2>/dev/null; then
      log "No more pages!"
      break
    fi

    # fd 4 keeps the listing separate from stdin of whatever runs inside.
    while IFS= read -r href <&4; do
      process_project "$(absolute_url "$href")"
    done 4< <(grep 'teaser__title' "$SEARCH_PAGE" \
      | sed 's/.*href="\([^"]*\)".*/\1/')

    page=$((page + 1))
  done

  echo "Done job: SCRAPE_PLANAPPS: $(date)"
}

main