48 lines
2.2 KiB
Bash
Executable File
48 lines
2.2 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies.
#
# Fetches the "Plans and Strategies" index page from london.ca, tracks the
# current section heading (<summary> text, or the <span> directly after a
# <summary>), and passes every linked PDF to _utils_download_helper so that it
# lands in
#   ./LondonArchive/Master Plans and Strategies/<section>/<file>.pdf
#
# Run from the directory that contains ./functions/.functions; that file is
# expected to define _utils_download_helper (URL, destination path).

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

source ./functions/.functions

# London seems to have recently blocked unusual user agents, so wget's default
# identity (and even ping) is refused. Pretending to be a regular desktop
# browser gets us through.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_mpas.html"

SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"

MPAS_SITE_ROOT="https://london.ca"
MPAS_ARCHIVE_DIR="./LondonArchive/Master Plans and Strategies"

#######################################
# Print the text of the last <tag>text</tag> pair found on a line.
# Arguments: $1 - tag name (e.g. summary), $2 - one line of HTML
# Outputs:   the enclosed text, or nothing when the line has no such pair
#######################################
_mpas_tag_text() {
  local tag=$1 line=$2
  printf '%s\n' "$line" \
    | sed -n "s/.*<${tag}>\\([^<]*\\)<\\/${tag}>.*/\\1/p"
}

#######################################
# List every href="...pdf" target on a line, one per output line.
# Arguments: $1 - one line of HTML
# Outputs:   raw (still percent-encoded) href values; nothing if none match
#######################################
_mpas_pdf_hrefs() {
  printf '%s\n' "$1" \
    | grep -o 'href="[^"]*\.pdf"' \
    | sed 's/^href="//; s/"$//'
}

#######################################
# Make a link readable for logging and for use as a local file name:
# %20 becomes a space and %27 (apostrophe) is dropped.
# Arguments: $1 - percent-encoded text
# Outputs:   the rewritten text
#######################################
_mpas_decode() {
  printf '%s\n' "$1" | sed 's/%20/ /g; s/%27//g'
}

#######################################
# Turn an href into a fetchable URL. The percent-encoding is left untouched so
# the request matches what the page links to.
# Globals:   MPAS_SITE_ROOT (read)
# Arguments: $1 - href value
# Outputs:   absolute URL
#######################################
_mpas_pdf_url() {
  local href=$1
  case "$href" in
    http://* | https://*) printf '%s\n' "$href" ;;
    //*) printf 'https:%s\n' "$href" ;;
    /*) printf '%s%s\n' "$MPAS_SITE_ROOT" "$href" ;;
    *) printf '%s/%s\n' "$MPAS_SITE_ROOT" "$href" ;;
  esac
}

#######################################
# Download the index page and archive every PDF it links to.
# Globals:   WGET_UA, TEMP_DIR, SEARCH_PAGE, SEARCH_URL, MPAS_SITE_ROOT,
#            MPAS_ARCHIVE_DIR (all read)
# Outputs:   section titles and PDF paths on stdout, errors on stderr
# Returns:   0 when the index page was processed, 1 on setup/download failure
#######################################
_mpas_main() {
  # Fail early instead of scraping the page and then failing on every PDF.
  if ! command -v _utils_download_helper > /dev/null 2>&1; then
    printf 'error: _utils_download_helper is not defined (is ./functions/.functions present?)\n' >&2
    return 1
  fi

  rm -f -- "$SEARCH_PAGE"
  # -p: the directory normally survives from a previous run.
  mkdir -p -- "$TEMP_DIR" || return 1

  # No --timestamping: wget does not support it together with -O, and the
  # page was removed above anyway.
  if ! wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" -q; then
    printf 'error: could not download %s\n' "$SEARCH_URL" >&2
    return 1
  fi

  local LINE FILE href dest_dir
  local CURRENT=""
  local LAST_LINE_SUMMARY=0

  # The page is split so that each tag starts its own line. The loops read from
  # fds 3 and 4 so the download helper cannot swallow pending lines via stdin.
  while IFS= read -r LINE <&3 || [[ -n "$LINE" ]]; do
    # A title wrapped in <span> right after a bare <summary> line.
    if ((LAST_LINE_SUMMARY)) && [[ "$LINE" == "<span>"* ]]; then
      FILE=$(_mpas_tag_text span "$LINE")
      if [[ -n "$FILE" ]]; then
        CURRENT=$FILE
        printf '%s\n' "$CURRENT"
      fi
    fi

    LAST_LINE_SUMMARY=0
    if [[ "$LINE" == *"<summary>"* ]]; then
      LAST_LINE_SUMMARY=1
      # A new section starts here: take the inline title if there is one,
      # otherwise clear the old title so PDFs are not filed under it.
      CURRENT=$(_mpas_tag_text summary "$LINE")
      if [[ -n "$CURRENT" ]]; then
        printf '%s\n' "$CURRENT"
      fi
    fi

    if [[ "$LINE" == *".pdf"* ]]; then
      # A line may carry several links, or mention ".pdf" without linking one.
      while IFS= read -r href <&4; do
        [[ -n "$href" ]] || continue
        FILE=$(_mpas_decode "${href#"$MPAS_SITE_ROOT"}")
        printf '%s\n' "$FILE"
        dest_dir="$MPAS_ARCHIVE_DIR/$CURRENT/"
        mkdir -p -- "$dest_dir"
        _utils_download_helper "$(_mpas_pdf_url "$href")" "${dest_dir}${FILE##*/}"
      done 4< <(_mpas_pdf_hrefs "$LINE")
    fi
  done 3< <(sed 's/></>\n</g' "$SEARCH_PAGE")

  return 0
}

_mpas_main "$@"