302 lines
15 KiB
Bash
Executable File
302 lines
15 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# SCRAPE_MPaS.SH - archive the projects published on London's "Get Involved"
# engagement site.
#
# This opening section prints the banner, loads the shared helper functions,
# and defines the scratch-file locations and the base URL that the rest of
# the script reads.

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# Shared helpers. The script later calls _utils_fix_dashes and
# _utils_download_helper, which are presumably defined in this file (verify).
# The path is relative, so the script has to be started from its own directory.
source ./functions/.functions

# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal

# London seems to have recently started blocking unusual user agents, so a
# default wget request (or even a ping) is refused. Thankfully we can pretend
# to be a regular desktop browser instead.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch area. Every working file lives below TEMP_DIR (note the trailing
# slash) and is rebuilt on each run.
TEMP_DIR="./tmp/"
SEARCH_PAGE="${TEMP_DIR}index_ginv.html"                    # downloaded project index
PROJECT_PAGE="${TEMP_DIR}project_ginv.html"                 # downloaded page of the current project
WORK_HTML="${TEMP_DIR}tmp.html"                             # scratch HTML for the photo-gallery PDF
CUSTOM_HTML="${TEMP_DIR}custom_ginv.html"                   # HTML that becomes the project's Main.pdf
CUSTOM_HTML_LINKS="${TEMP_DIR}custom_link_ginv.html"        # document-library link lines
CUSTOM_HTML_PHOTOS="${TEMP_DIR}custom_photo_ginv.html"      # photo URLs, one per line
CUSTOM_HTML_FAQ="${TEMP_DIR}custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="${TEMP_DIR}custom_profile_ginv.html"
CUSTOM_HTML_UPDATE="${TEMP_DIR}custom_update_ginv.html"
CUSTOM_HTML_KEYDATES="${TEMP_DIR}custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="${TEMP_DIR}custom_slider_ginv.html"     # slider markup for the gallery PDF
FULLDUMP="${TEMP_DIR}.fulldump.txt"                         # plain-text dump of everything scraped

# -p keeps repeat runs quiet: a plain mkdir fails with "File exists" once the
# directory has been created by an earlier run.
mkdir -p -- "$TEMP_DIR"

# Drop any index page left over from a previous run.
rm -f -- "$SEARCH_PAGE"

SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"
# ---------------------------------------------------------------------------
# Scrape every project listed on the index page.
#
# For each project the script downloads the project page, copies selected
# sections into $CUSTOM_HTML (rendered to Main.pdf) and $FULLDUMP (saved as
# .backup.txt), downloads linked documents and photos, and renders the slider
# into a gallery PDF.
# ---------------------------------------------------------------------------

#######################################
# Collapse runs of whitespace to single spaces and trim both ends.
# Produces what an unquoted `echo $text` would, but without pathname
# expansion, so a stray `*` or `?` in scraped HTML can never be replaced by
# local file names.
# Arguments: $1 - text to normalise
# Outputs:   normalised text on stdout
#######################################
_ginv_squash() {
  local IFS=$' \t\n'
  local -a words=()
  # read reports failure at end of input even though the array was filled.
  read -r -d '' -a words <<< "$1" || true
  printf '%s\n' "${words[*]}"
}

#######################################
# Append one whitespace-normalised line to each of the given files.
# Arguments: $1 - text to write; $2.. - files to append to
#######################################
_ginv_emit() {
  local text target
  text=$(_ginv_squash "$1")
  shift
  for target in "$@"; do
    printf '%s\n' "$text" >> "$target"
  done
}

#######################################
# Print the value of the last href="..." attribute found in an HTML line.
# When the line has no href the normalised line itself is printed.
# Arguments: $1 - HTML line
#######################################
_ginv_href() {
  _ginv_squash "$(_ginv_squash "$1" | sed 's/.*href="\([^"]*\)".*/\1/')"
}

#######################################
# Turn the <span> holding a project title into a directory-safe name.
# Removes quote characters, colons and the registered sign, spells slashes
# and ampersands out as "and", and trims surrounding blanks.
# Arguments: $1 - HTML line(s) containing the title span
# Outputs:   cleaned project name on stdout
#######################################
_ginv_clean_name() {
  # The apostrophe entity is stripped before '&' is rewritten; afterwards it
  # would read "and#39;" and could no longer be matched.
  # The last two expressions do the trimming. The earlier 's/^ +| +$//g' never
  # matched anything because '+' and '|' are literal in a basic regex.
  _ginv_squash "$1" | sed \
    -e 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' \
    -e 's/&#39;//g' \
    -e 's/‘//g' \
    -e 's/\// and /g' \
    -e 's/\\/ and /g' \
    -e 's/’//g' \
    -e "s/'//g" \
    -e 's/://g' \
    -e 's/®//g' \
    -e 's/"//g' \
    -e 's/&/and/g' \
    -e 's/amp;//g' \
    -e 's/^ *//' \
    -e 's/ *$//'
}

#######################################
# Print the link text of a document-library entry without its " (pdf)"
# marker and without surrounding blanks.
# Arguments: $1 - HTML line containing the <a> element
#######################################
_ginv_doc_title() {
  _ginv_squash "$1" | sed \
    -e 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/g' \
    -e 's/ (pdf)//' \
    -e 's/^ *//' \
    -e 's/ *$//'
}

#######################################
# Walk $PROJECT_PAGE line by line and copy the interesting sections out.
# Each section is recognised by a marker string that switches a flag on; the
# flag is switched off again by that section's own end marker. A line is
# first offered to every open section and only then checked for start
# markers, so the line that opens a section is never copied as its content.
# Globals: PROJECT_PAGE (read); CUSTOM_HTML, CUSTOM_HTML_LINKS,
#          CUSTOM_HTML_PHOTOS, CUSTOM_HTML_SLIDER, CUSTOM_HTML_UPDATE,
#          FULLDUMP (appended to)
#######################################
_ginv_scrape_project_page() {
  local line converted
  local in_docs=0 in_photos=0 in_faq=0 in_profile=0 in_update=0
  local in_keydates=0 in_slider=0 in_single_image=0

  while IFS= read -r line; do

    # Document library: keep the link lines until the list closes.
    if (( in_docs )); then
      if [[ "$line" == "</ul>" ]]; then
        in_docs=0
        # No real document link collected: discard the file so that the
        # download step is skipped. grep stays quiet if it never existed.
        if ! grep -q "/documents/" "$CUSTOM_HTML_LINKS" 2> /dev/null; then
          rm -f -- "$CUSTOM_HTML_LINKS"
        fi
        echo "End of current documents."
      elif [[ "$line" == *"a data-url"* ]]; then
        _ginv_emit "$line" "$CUSTOM_HTML_LINKS" "$FULLDUMP"
        _ginv_squash "$line"
      fi
    fi

    # Photo block: keep only the image URLs; a blank line ends the block.
    if (( in_photos )); then
      if [[ -z "$line" ]]; then
        in_photos=0
        if ! grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2> /dev/null; then
          rm -f -- "$CUSTOM_HTML_PHOTOS"
        fi
        echo "End of current photos."
      elif [[ "$line" == *'aria-label="'* && "$line" != *"</h1>"* ]]; then
        _ginv_emit "$(_ginv_href "$line")" "$CUSTOM_HTML_PHOTOS" "$FULLDUMP"
      fi
    fi

    # FAQ block.
    if (( in_faq )); then
      if [[ "$line" == *"div class='clearfix'></div"* ]]; then
        in_faq=0
        echo "End of current FAQ."
      elif [[ "$line" != *"btn btn-close btn-inverse close"* ]]; then
        # I don't care that this is invalid HTML. All you'll see in the end
        # is a nicely formatted PDF.
        if [[ "$line" == *"hive-block-faq mod-reverse"* ]]; then
          # Question links become headings (written to the PDF source only).
          converted=$(_ginv_squash "$line" \
            | sed -e 's/<a role/<h3 role/g' -e 's/<\/a>/<\/h3>/g')
          _ginv_emit "$converted" "$CUSTOM_HTML"
        elif [[ "$line" != *"</h1>"* ]]; then
          _ginv_emit "$line" "$CUSTOM_HTML" "$FULLDUMP"
        fi
      fi
    fi

    # Project-team profile block.
    if (( in_profile )); then
      if [[ "$line" == *"<!--[if IE]>"* ]]; then
        in_profile=0
        echo "End of current profile."
      elif [[ "$line" != *"btn btn-close btn-inverse close"* ]]; then
        _ginv_emit "$line" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # Project-update block.
    if (( in_update )); then
      if [[ "$line" == *"<div class='clearfix'></div>"* ]]; then
        in_update=0
        echo "End of current update."
      elif [[ "$line" != *"btn-unfill btn-primary"* && "$line" != *'class="sr-only"'* ]]; then
        _ginv_emit "$line" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # Key-dates block; a blank line ends it.
    if (( in_keydates )); then
      if [[ -z "$line" ]]; then
        in_keydates=0
        echo "End of current key dates."
      elif [[ "$line" != *"btn btn-default"* && "$line" != *"btn-close btn-inverse close"* ]]; then
        _ginv_emit "$line" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # Image slider: collected separately and rendered as its own gallery PDF.
    if (( in_slider )); then
      if [[ "$line" == *"<!-- Controls -->"* ]]; then
        in_slider=0
        echo "End of current slider."
      elif [[ "$line" != *"btn btn-default"* && "$line" != *"</h3"* ]]; then
        _ginv_emit "$line" "$CUSTOM_HTML_SLIDER" "$FULLDUMP"
      fi
    fi

    # Single image block; a blank line ends it.
    if (( in_single_image )); then
      if [[ -z "$line" ]]; then
        in_single_image=0
        if grep -q "amazonaws" "$CUSTOM_HTML_PHOTOS" 2> /dev/null; then
          cat "$CUSTOM_HTML_PHOTOS"
        else
          rm -f -- "$CUSTOM_HTML_PHOTOS"
        fi
        echo "End of current single image."
      elif [[ "$line" == *'class="hive-image"'* ]]; then
        _ginv_emit "$line" "$CUSTOM_HTML" "$FULLDUMP"
      fi
    fi

    # Start markers. At most one of them is honoured per line.
    if [[ "$line" == *'div class="full-description hide"'* ]]; then
      echo "Found content start."
      # The marker is written here, next to the content, so that it only
      # appears when a description exists. It is written for every matching
      # line.
      echo "<!-- LondonArchive_GINV_Body -->" >> "$FULLDUMP"
      # Unwrap the description and turn embedded YouTube players into plain
      # links, since an iframe is useless in a PDF.
      converted=$(printf '%s\n' "$line" | sed \
        -e 's/.*<div class="full-description hide">/<div>/' \
        -e 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' \
        -e 's/<iframe/<a/' \
        -e 's/<\/iframe>/YouTube Link<\/a><\/br>/')
      _ginv_emit "$converted" "$CUSTOM_HTML" "$FULLDUMP"

    elif [[ "$line" == *"widget-wrap widget_document_library"* ]]; then
      in_docs=1
      echo "<!-- LondonArchive_GINV_Documents -->" >> "$FULLDUMP"
      echo "Found documents start."

    elif [[ "$line" == *"hive-block-media hive-block"* ]]; then
      in_photos=1
      echo "<!-- LondonArchive_GINV_Photos -->" >> "$FULLDUMP"
      echo "Found photos start."

    # NOTE(review): the FAQ section is keyed on the "recent photos" widget
    # class. Left as found - confirm against a live page that this is intended.
    elif [[ "$line" == *"div class='widget-wrap widget_recent_photos'"* ]]; then
      in_faq=1
      echo "<!-- LondonArchive_GINV_FAQ -->" >> "$FULLDUMP"
      echo "Found FAQ start."

    elif [[ "$line" == *"widget-wrap widget_project_team"* ]]; then
      in_profile=1
      echo "<!-- LondonArchive_GINV_Bio -->" >> "$FULLDUMP"
      echo "Found profile start."

    elif [[ "$line" == *"<div class='fr-view'>"* ]]; then
      in_update=1
      echo "<!-- LondonArchive_GINV_Update -->" >> "$FULLDUMP"
      # This file only ever receives the heading; nothing in this script
      # reads it back.
      echo "<h1>Project Updates</h1>" >> "$CUSTOM_HTML_UPDATE"
      echo "Found update start."

    elif [[ "$line" == *"div class='widget-wrap widget_life_cycle'"* ]]; then
      in_keydates=1
      echo "<!-- LondonArchive_GINV_Date -->" >> "$FULLDUMP"
      echo "Found key dates start."

    elif [[ "$line" == *"<!-- Wrapper for slider -->"* ]]; then
      in_slider=1
      echo "<!-- LondonArchive_GINV_Slider -->" >> "$FULLDUMP"
      echo "Found slider start."

    elif [[ "$line" == *"hive-block hive-block-image"* ]]; then
      in_single_image=1
      echo "<!-- LondonArchive_GINV_SingleImage -->" >> "$FULLDUMP"
      echo "Found single image start."
    fi

  done < "$PROJECT_PAGE"
}

#######################################
# Download the documents and photos collected by the page scrape.
# The lists are read through descriptor 4 so that the download helper cannot
# swallow them via its standard input.
# Globals:   CUSTOM_HTML_LINKS, CUSTOM_HTML_PHOTOS (read)
# Arguments: $1 - archive directory of the current project
#######################################
_ginv_download_attachments() {
  local attach_dir="$1/Attachments"
  local entry doc_name

  if [[ -s "$CUSTOM_HTML_LINKS" ]]; then
    while IFS= read -r entry <&4; do
      if [[ "$entry" == *"/documents/"* ]]; then
        mkdir -p "$attach_dir"
        doc_name="$(_ginv_doc_title "$entry").pdf"
        echo "-------- $doc_name"
        _utils_download_helper "$(_ginv_href "$entry")/download" "$attach_dir/$doc_name"
      fi
    done 4< "$CUSTOM_HTML_LINKS"
  fi

  if [[ -s "$CUSTOM_HTML_PHOTOS" ]]; then
    while IFS= read -r entry <&4; do
      if [[ "$entry" == *"ehq-production"* ]]; then
        mkdir -p "$attach_dir"
        # File name = last path segment without its 32-hex-digit prefix.
        doc_name=$(_ginv_squash "$entry" \
          | sed -e 's/.*\///' -e 's/^[0-9a-f]\{32\}_//')
        echo "======== $doc_name"
        _utils_download_helper "$entry" "$attach_dir/$doc_name"
      fi
    done 4< "$CUSTOM_HTML_PHOTOS"
  fi
}

#######################################
# Render the gallery PDF (only when slider markup was collected) and the
# project's Main.pdf.
# Globals:   CUSTOM_HTML (appended to), CUSTOM_HTML_SLIDER (read),
#            WORK_HTML (rewritten), PROJECT_NAME (read)
# Arguments: $1 - archive directory of the current project
#######################################
_ginv_render_pdfs() {
  local archive_dir=$1
  local footer
  footer="<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>"

  if [[ -s "$CUSTOM_HTML_SLIDER" ]]; then
    mkdir -p "$archive_dir/Attachments"
    {
      cat ./template/default_getinvolved.html
      echo "<h1>$PROJECT_NAME Photo Gallery</h1>"
      cat "$CUSTOM_HTML_SLIDER"
      echo "$footer"
    } > "$WORK_HTML"
    wkhtmltopdf --image-quality 100 "$WORK_HTML" "$archive_dir/Attachments/${PROJECT_NAME}_GALLERY.pdf"
  fi

  echo "$footer" >> "$CUSTOM_HTML"
  wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "$archive_dir/Main.pdf"
}

#######################################
# Archive the project described by PROJECT_NAME / PROJECT_URL.
# Globals: PROJECT_NAME, PROJECT_URL, WGET_UA and the scratch paths (read);
#          FOUND_DATE (reset)
# Returns: 1 when the project page could not be downloaded, else 0
#######################################
_ginv_archive_project() {
  local archive_dir="./LondonArchive/GetInvolved/$PROJECT_NAME"

  # Not read anywhere in this script; kept in case a sourced helper uses it.
  FOUND_DATE=0

  printf '%s\n' "$PROJECT_URL" "$PROJECT_NAME"

  # Start from a clean slate so nothing leaks over from the previous project.
  rm -f -- "$CUSTOM_HTML_LINKS" "$CUSTOM_HTML_PHOTOS" "$CUSTOM_HTML_FAQ" \
    "$CUSTOM_HTML_PROFILE" "$CUSTOM_HTML_UPDATE" "$CUSTOM_HTML_KEYDATES" \
    "$CUSTOM_HTML_SLIDER" "$FULLDUMP"

  wget --user-agent="$WGET_UA" "$PROJECT_URL" -O "$PROJECT_PAGE" --timestamping -q #--show-progress

  # -O leaves an empty file behind when the download fails. Rendering that
  # would replace an earlier, good Main.pdf with a blank one, so skip instead.
  if [[ ! -s "$PROJECT_PAGE" ]]; then
    echo "Skipping '$PROJECT_NAME': could not download $PROJECT_URL" >&2
    return 1
  fi

  # Now we can work on the actual project page.
  cat ./template/default_getinvolved.html > "$CUSTOM_HTML"
  echo "<h1>$PROJECT_NAME</h1>" >> "$CUSTOM_HTML"

  _ginv_scrape_project_page

  # Show the collected document links (they are deliberately not appended to
  # $CUSTOM_HTML). The file is absent when the project has no documents.
  if [[ -s "$CUSTOM_HTML_LINKS" ]]; then
    cat "$CUSTOM_HTML_LINKS"
  fi

  mkdir -p "$archive_dir/"
  _ginv_download_attachments "$archive_dir"
  _ginv_render_pdfs "$archive_dir"

  # The dump only exists when at least one section was found.
  if [[ -e "$FULLDUMP" ]]; then
    cp "$FULLDUMP" "$archive_dir/.backup.txt"
  fi
  return 0
}

# --- Main -------------------------------------------------------------------

# Explicit starting state, so values inherited from the environment cannot
# trigger a bogus first project.
PROJECT_NAME=""
PROJECT_URL=""
LAST_LINE=""
NEXT_LINE_CONT_NAME=0

wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
SEARCH_STATUS=$?

# wget exits with status 8 when the server answered with an error response.
# As before, any other status still lets the scrape go ahead.
if [ "$SEARCH_STATUS" -eq 8 ]; then
  echo "The server refused the project index request; nothing to scrape." >&2
elif [[ ! -r "$SEARCH_PAGE" ]]; then
  echo "The project index page was not downloaded; nothing to scrape." >&2
else
  # The index is read through descriptor 3 so that wget, wkhtmltopdf and the
  # download helper cannot consume it through their standard input.
  while IFS= read -r LINE <&3; do

    # A complete name/URL pair was gathered from the preceding lines:
    # archive that project before looking at the current line.
    if [[ -n "$PROJECT_NAME" ]]; then
      _ginv_archive_project
      PROJECT_NAME=""
    fi

    # Second half of a title whose <span> was split over two lines.
    if (( NEXT_LINE_CONT_NAME )); then
      PROJECT_NAME=$(_utils_fix_dashes "$(_ginv_clean_name "$LAST_LINE$LINE")")
      NEXT_LINE_CONT_NAME=0
      printf '%s\n' "$PROJECT_NAME"
    fi

    if [[ "$LINE" == *"project-tile__meta__name"* ]]; then
      if [[ "$LINE" != *"</span"* ]]; then
        # The title continues on the next line; remember this half.
        NEXT_LINE_CONT_NAME=1
        LAST_LINE=$LINE
      else
        PROJECT_NAME=$(_utils_fix_dashes "$(_ginv_clean_name "$LINE")")
        printf '%s\n' "$PROJECT_NAME"
      fi
    fi

    if [[ "$LINE" == *"project-tile__link"* ]]; then
      PROJECT_URL="$SEARCH_URL$(_ginv_href "$LINE")"
      echo " $PROJECT_URL"
      # Reset project name to mark the start of a new project
      PROJECT_NAME=""
    fi

  done 3< "$SEARCH_PAGE"
fi