LondonScrapers/SCRAPE_JOB.SH

128 lines
5.8 KiB
Bash

#!/usr/bin/env bash
# Record the start of a timed step.
# Globals written:
#   START_SECONDS  - start time in seconds since the Unix epoch
#   START_READABLE - the same instant as "YYYY-MM-DD HH:MM:SS" (local time)
start_timer() {
  local now
  # A single `date` call produces both forms, so they always describe the
  # same second (two separate calls can straddle a second boundary).
  now=$(date '+%s %Y-%m-%d %H:%M:%S')
  START_SECONDS=${now%% *}
  START_READABLE=${now#* }
}
# Record the end of a timed step and compute how long it took.
# Globals read:
#   START_SECONDS    - epoch seconds stored by start_timer
# Globals written:
#   END_SECONDS      - end time in seconds since the Unix epoch
#   END_READABLE     - the same instant as "YYYY-MM-DD HH:MM:SS" (local time)
#   ELAPSED_SECONDS  - END_SECONDS minus START_SECONDS
#   ELAPSED_READABLE - elapsed time formatted as HH:MM:SS
end_timer() {
  local now
  # One `date` call keeps END_SECONDS and END_READABLE on the same second.
  now=$(date '+%s %Y-%m-%d %H:%M:%S')
  END_SECONDS=${now%% *}
  END_READABLE=${now#* }
  # If start_timer was never called, measure from "now" so the result is
  # 00:00:00 instead of the number of seconds since 1970.
  ELAPSED_SECONDS=$((END_SECONDS - ${START_SECONDS:-$END_SECONDS}))
  ELAPSED_READABLE=$(printf '%02d:%02d:%02d' \
    "$((ELAPSED_SECONDS / 3600))" \
    "$(((ELAPSED_SECONDS % 3600) / 60))" \
    "$((ELAPSED_SECONDS % 60))")
}
# Append one table row describing a finished step to the HTML log index.
# Arguments:
#   $1 - path of the step's log file; only its base name is used, as a link
#        relative to the index page
#   $2 - label shown for the step
# Globals read:
#   CRON_LOG_INDEX   - HTML file the row is appended to
#   START_READABLE, END_READABLE, ELAPSED_READABLE - timings of the step
# Returns: non-zero if the row could not be written.
push_log() {
  local log_name cell
  log_name=$(basename -- "$1")
  cell="<td bgcolor='#bababa' style='color:black;'>"
  {
    # The link cell; the anchor is closed explicitly so the row is valid HTML.
    printf '<tr> %s<a href="./%s">View</a></td> ' "$cell" "$log_name"
    # printf reuses the format for each (cell, value) pair.
    printf '%s%s</td> ' \
      "$cell" "$2" \
      "$cell" "$START_READABLE" \
      "$cell" "$END_READABLE" \
      "$cell" "$ELAPSED_READABLE"
    printf '</tr>\n'
  } >> "$CRON_LOG_INDEX"
}
# Escape a string so it can be placed between double quotes in a JSON document.
# Handles backslash, double quote, newline, carriage return and tab.
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Post a status embed to the Discord webhook.
# Arguments:
#   $1 - embed title
#   $2 - path of the related log file; only its base name is used to build
#        the public link
#   $3 - embed description
#   $4 - start time shown in the footer
#   $5 - elapsed time shown in the footer
# Environment:
#   SCRAPE_WEBHOOK_URL - optional override for the webhook endpoint
# Returns: curl's exit status.
push_webhook() {
  local hook_url log_name title link description footer payload
  # NOTE(review): this webhook URL embeds a credential that is committed to
  # the script. Prefer supplying SCRAPE_WEBHOOK_URL from the environment and
  # rotating the hard-coded default.
  hook_url=${SCRAPE_WEBHOOK_URL:-https://discord.com/api/webhooks/1472056322886209600/8EtHDzTdVYuaU2mn0-fY6BZZwxW4ZMkNnGzFyTCJhcS6FMHYagjxeyw0rw9o5S-TNRRA}
  log_name=$(basename -- "$2")

  # Every interpolated value is JSON-escaped so quotes, backslashes or line
  # breaks in a message cannot produce an invalid payload.
  title=$(_json_escape "$1")
  link=$(_json_escape "https://mystery-of-the-typical-tiny-gold-iguana.randommeaninglesscharacters.com/lalogs/${log_name}")
  description=$(_json_escape "$3")
  footer=$(_json_escape "Start: $4, time elapsed: $5")

  printf -v payload \
    '{"embeds": [{"color": 19712, "title": "%s", "url": "%s", "description": "%s", "footer": {"text": "%s"}}]}' \
    "$title" "$link" "$description" "$footer"

  # Bound the request time so an unresponsive endpoint cannot stall the job.
  curl \
    --max-time 30 \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$hook_url"
}
# Date tag (YYYYMMDD) used in the name of every per-run log file.
datestamp=$(date +'%Y%m%d')
export datestamp

# Directory holding the HTML log index and the per-step log files.
CRON_LOG_DIR="/var/www/mystery-of-the-typical-tiny-gold-iguana.randommeaninglesscharacters.com/lalogs"

# Index page plus one dated log file per job step, all inside CRON_LOG_DIR.
CRON_LOG_INDEX="${CRON_LOG_DIR}/index.html"
CRON_LOG_SCRIPT="${CRON_LOG_DIR}/${datestamp}_script.txt"
CRON_LOG_MEET="${CRON_LOG_DIR}/${datestamp}_meet.txt"
CRON_LOG_PLAN="${CRON_LOG_DIR}/${datestamp}_plan.txt"
CRON_LOG_JSON="${CRON_LOG_DIR}/${datestamp}_json.txt"
CRON_LOG_LTC="${CRON_LOG_DIR}/${datestamp}_ltc.txt"
CRON_LOG_LPS="${CRON_LOG_DIR}/${datestamp}_lps.txt"
CRON_LOG_S3="${CRON_LOG_DIR}/${datestamp}_s3.txt"
CRON_LOG_SITEMAP="${CRON_LOG_DIR}/${datestamp}_sitemap.txt"

# Exported so the scripts launched by this job can see them.
export CRON_LOG_DIR CRON_LOG_INDEX CRON_LOG_SCRIPT CRON_LOG_MEET \
  CRON_LOG_PLAN CRON_LOG_JSON CRON_LOG_LTC CRON_LOG_LPS CRON_LOG_S3 \
  CRON_LOG_SITEMAP
# Make sure the log directory exists. -p creates missing parents and stays
# silent when the directory is already there, which is the case on every run
# after the first.
mkdir -p "$CRON_LOG_DIR"
# Seed the log index from the template only when no index exists yet, so rows
# appended by earlier runs are kept.
if [ ! -f "$CRON_LOG_INDEX" ]; then
  cp "./template/logdir.html" "$CRON_LOG_INDEX"
fi
# Run one timed job step, append its stdout to a log file, then record the
# step in the HTML index and announce it on the webhook.
# Arguments:
#   $1   - log file the command's stdout is appended to
#   $2   - step label, used for both the index row and the webhook title
#   $3   - webhook description
#   $4.. - command to run, with its arguments
run_step() {
  local log_file=$1 label=$2 summary=$3
  shift 3
  start_timer
  "$@" >> "$log_file"
  end_timer \
    && push_log "$log_file" "$label" \
    && push_webhook "$label" "$log_file" "$summary" "$START_READABLE" "$ELAPSED_READABLE"
}

# Separate timer for the whole job; start_timer/end_timer are reused per step.
START_SECONDS_ALL=$(date +%s)
START_READABLE_ALL=$(date "+%Y-%m-%d %H:%M:%S")
push_webhook "Start scrape job" "$CRON_LOG_INDEX" "Starting London Archive scrapers." "$START_READABLE_ALL" "N/A"

# Working tree handed to the S3 sync below. -p also creates ./LondonArchive
# itself and does not complain if a previous run left directories behind.
mkdir -p \
  "./LondonArchive/Meetings" \
  "./LondonArchive/Meetings (JSON)" \
  "./LondonArchive/Planning Applications" \
  "./LondonArchive/LTC" \
  "./LondonArchive/LPS"

start_timer
# Back up scripts regularly.
mkdir -p "./SCRIPTS"
# The ./ prefix keeps a file name starting with "-" from being read as an option.
cp ./*.SH "./SCRIPTS/"
cp ./*.TXT "./SCRIPTS/"
# NOTE(review): the archive password is hard-coded and passed on the command
# line, where it is visible in the process list; consider rotating it and
# reading it from a protected file or environment variable.
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "SCRIPTS.7z" "./SCRIPTS" -mhe=on
rm -r "./SCRIPTS"
mv "SCRIPTS.7z" "./LondonArchive/Log_${datestamp}.7z"
echo "This log is private. However, all other logs are public." >> "$CRON_LOG_SCRIPT"
end_timer && push_log "$CRON_LOG_SCRIPT" "BACK UP SCRIPT"

run_step "$CRON_LOG_MEET" "SCRAPE_MEET.SH" "Done processing city meetings." \
  ./SCRAPE_MEET.SH

# Planning-application scrape is currently disabled.
#run_step "$CRON_LOG_PLAN" "SCRAPE_PLAN.SH" "Done processing planning applications." \
#  ./SCRAPE_PLAN.SH

# NOTE(review): this step runs SCRAPE_ESCRIBE.SH but is labelled
# SCRAPE_JSON.SH; the label is kept as-is - confirm whether it should change.
run_step "$CRON_LOG_JSON" "SCRAPE_JSON.SH" "Done backing up eScribe meeting lists." \
  ./SCRAPE_ESCRIBE.SH

run_step "$CRON_LOG_LTC" "SCRAPE_LTC.SH" "Done processing LTC meetings." \
  ./SCRAPE_LTC.SH

# The index row for this step was previously labelled SCRAPE_LTC.SH.
run_step "$CRON_LOG_LPS" "SCRAPE_LPS.SH" "Done processing LPS meetings." \
  ./SCRAPE_LPS.SH

run_step "$CRON_LOG_S3" "AWS S3 SYNC" "Done syncing files to S3." \
  aws s3 sync ./LondonArchive "s3://public-file-browser-files-0261cd08327d/" \
  --profile london --no-progress --size-only

# NOTE(review): the working tree is cleared whether or not the sync succeeded.
rm -rf "./LondonArchive"
mkdir -p "./LondonArchive"

# Make/upload sitemap AFTER clearing the work dir. Otherwise everything gets uploaded again.
run_step "$CRON_LOG_SITEMAP" "MAKE_SITEMAP.SH" "Done updating archive sitemap, requested YaCy indexing." \
  ./MAKE_SITEMAP.SH
rm -rf "./LondonArchive"

# Total wall-clock time of the job, formatted as HH:MM:SS.
END_SECONDS_ALL=$(date +%s)
ELAPSED_SECONDS_ALL=$((END_SECONDS_ALL - START_SECONDS_ALL))
ELAPSED_READABLE_ALL=$(printf "%02d:%02d:%02d" \
  "$((ELAPSED_SECONDS_ALL / 3600))" \
  "$(((ELAPSED_SECONDS_ALL % 3600) / 60))" \
  "$((ELAPSED_SECONDS_ALL % 60))")
push_webhook "Finished scrape job" "$CRON_LOG_INDEX" "Archive is now fully updated." "$START_READABLE_ALL" "$ELAPSED_READABLE_ALL"