#!/usr/bin/env bash
# Startup banner: one blank line followed by the seven-line title box.
# The quoted delimiter keeps the text literal (no expansion inside).
cat <<'BANNER'

-========================================================================-
-=- -=-
-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-
-=- -=-
-=- Lillian Skinner -=-
-=- -=-
-========================================================================-
BANNER
source ./functions/.functions
# Todo:
# - Save updates (see bradley-ave)
# - Order, title, and collapse each scraped modal
# London seems to have recently blocked unusual user agents, so plain wget (and even ping) no longer works. Thankfully, we can pretend to be a real browser instead!
# --- Configuration ---------------------------------------------------------
# User-agent string passed to every wget call via --user-agent.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch directory. The trailing slash is part of the value, so the paths
# below are built by plain concatenation and the directory is named once.
TEMP_DIR="./tmp/"

# Downloaded pages: the project index and the project currently being scraped.
SEARCH_PAGE="${TEMP_DIR}index_ginv.html"
PROJECT_PAGE="${TEMP_DIR}project_ginv.html"

# General-purpose working file.
WORK_HTML="${TEMP_DIR}tmp.html"

# Output page for the current project (seeded from the template), plus the
# per-section fragment files that are deleted at the start of each project.
CUSTOM_HTML="${TEMP_DIR}custom_ginv.html"
CUSTOM_HTML_LINKS="${TEMP_DIR}custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="${TEMP_DIR}custom_photo_ginv.html"
CUSTOM_HTML_FAQ="${TEMP_DIR}custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="${TEMP_DIR}custom_profile_ginv.html"
CUSTOM_HTML_UPDATE="${TEMP_DIR}custom_update_ginv.html"
CUSTOM_HTML_KEYDATES="${TEMP_DIR}custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="${TEMP_DIR}custom_slider_ginv.html"

# Running copy of the lines scraped from the current project.
FULLDUMP="${TEMP_DIR}.fulldump.txt"
# Start from a clean index page and make sure the scratch directory exists.
# -p keeps re-runs quiet: a plain mkdir reports an error once ./tmp/ is there.
rm -f -- "$SEARCH_PAGE"
mkdir -p -- "$TEMP_DIR"

# Base URL of the engagement site; the project index lives under /projects.
SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"

# Download the project index into $SEARCH_PAGE.
# NOTE: this wget must remain the LAST command of this section, because the
# statement that follows inspects its exit status through $? (the GNU wget
# manual lists exit status 8 as "server issued an error response").
wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O "$SEARCH_PAGE" --timestamping -q #--show-progress
if [ $? -ne 8 ]; then
while IFS= read -r LINE; do
if [[ "$PROJECT_NAME" != "" ]]; then
FOUND_DATE=0
echo $PROJECT_URL
echo $PROJECT_NAME
wget --user-agent="$WGET_UA" "$PROJECT_URL" -O $PROJECT_PAGE --timestamping -q #--show-progress
# Now we can work on the actual project page.
rm -f $CUSTOM_HTML_LINKS
rm -f $CUSTOM_HTML_PHOTOS
rm -f $CUSTOM_HTML_FAQ
rm -f $CUSTOM_HTML_PROFILE
rm -f $CUSTOM_HTML_UPDATE
rm -f $CUSTOM_HTML_KEYDATES
rm -f $CUSTOM_HTML_SLIDER
rm -f $FULLDUMP
cat ./template/default_getinvolved.html > $CUSTOM_HTML
echo "
$PROJECT_NAME
" >> $CUSTOM_HTML
while IFS= read -r LINE_PROJ; do
if (( IS_DOC_BLOCK )); then
if [[ "$LINE_PROJ" == "" ]]; then
IS_DOC_BLOCK=0
FOUND_LINK=$(cat "$CUSTOM_HTML_LINKS" | grep "/documents/")
if [[ "$FOUND_LINK" == "" ]]; then
rm -f $CUSTOM_HTML_LINKS
fi
echo "End of current documents."
elif [[ "$LINE_PROJ" == *"a data-url"* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML_LINKS
echo $LINE_PROJ >> $FULLDUMP
echo $LINE_PROJ
fi
fi
if (( IS_PHOTO_BLOCK )); then
if [[ "$LINE_PROJ" == "" ]]; then
IS_PHOTO_BLOCK=0
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
if [[ "$FOUND_LINK" == "" ]]; then
rm -f $CUSTOM_HTML_PHOTOS
fi
echo "End of current photos."
else
if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *""* ]]; then
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $CUSTOM_HTML_PHOTOS
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $FULLDUMP
fi
fi
fi
if (( IS_FAQ_BLOCK )); then
if [[ "$LINE_PROJ" == *"div class='clearfix'>/<\/h3>/g') >> $CUSTOM_HTML
elif [[ "$LINE_PROJ" != *""* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
fi
if (( IS_PROFILE_BLOCK )); then
if [[ "$LINE_PROJ" == *""* ]]; then
IS_SLIDER_BLOCK=0
echo "End of current key dates."
else
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"> $CUSTOM_HTML_SLIDER
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
if (( IS_SINGLE_IMAGE_BLOCK )); then
if [[ "$LINE_PROJ" == "" ]]; then
IS_SINGLE_IMAGE_BLOCK=0
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
if [[ "$FOUND_LINK" == "" ]]; then
rm -f $CUSTOM_HTML_PHOTOS
else
cat "$CUSTOM_HTML_PHOTOS"
fi
echo "End of current single image."
else
if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
echo $LINE_PROJ >> $CUSTOM_HTML
echo $LINE_PROJ >> $FULLDUMP
fi
fi
fi
if [[ "$LINE_PROJ" == *'div class="full-description hide"'* ]]; then
FIRST_CONTENT=1
# We'll write the LA comment inside of the content block.
# There we can ensure that the comment is only written if content does exist.
echo "Found content start."
if (( FIRST_CONTENT )); then
echo "" >> $FULLDUMP
FIRST_CONTENT=0
fi
echo $(echo " $LINE_PROJ" | sed 's/.*/
/' | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/