Upload files to "/"
This commit is contained in:
commit
ec46f77bc4
99
SCRAPE_ESCRIBE.SH
Normal file
99
SCRAPE_ESCRIBE.SH
Normal file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env bash
#
# SCRAPE_ESCRIBE.SH: back up the eScribe "past meetings" listings as JSON.
#
# Input:  websites.csv in the working directory. Each line holds three
#         whitespace-separated fields (quotes and commas are stripped):
#           INDEX_URL  CITY_ARCHIVE_NAME  CALENDAR_NAME
# Output: <CITY_ARCHIVE_NAME>/Meetings (JSON)/<type>/<type>_<page-1>.json
#         for every meeting type offered in the index page's type listbox.
# Needs:  wget, curl, jq, sed.

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_ESCRIBE.SH: Download eScribe meetings JSONs -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# London seems to have recently blocked unusual user agents, so requests
# present themselves as a regular desktop browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

TEMP_DIR="./tmp/"
INDEX_PAGE="./tmp/index_cal.html"
SEARCH_PAGE="./tmp/search.html"
AGENDA_HTML="./tmp/work.html"
ADDENDUM_HTML="./tmp/addendum.html"
#VIDEO_TIMESTAMP_JSON="./tmp/time_cal.json"

# A page holding this many meetings is assumed to be followed by another one.
PAGE_SIZE=50
# How often the index is re-downloaded when the meeting type list is missing.
MAX_INDEX_ATTEMPTS=3

# Fetch one page of past meetings and store it pretty-printed.
#   $1 - meeting type name, $2 - page number (1-based), $3 - output file
# The request body is kept exactly in the form the endpoint has been accepting.
fetch_past_meetings_page() {
  local meeting_name=$1 page=$2 out_file=$3
  curl -s -d "{'type': '$meeting_name', 'pageNumber': $page}" \
    -H "Content-Type: application/json" \
    -X POST "${INDEX_URL}MeetingsCalendarView.aspx/PastMeetings" --insecure \
    | jq . > "$out_file"
}

# Start from a clean scratch directory (this also removes the old index,
# search and agenda pages, which all live inside it).
rm -rf -- "$TEMP_DIR"
mkdir -p -- "$TEMP_DIR"

if [[ ! -r websites.csv ]]; then
  echo "SCRAPE_ESCRIBE: websites.csv not found or unreadable." >&2
  exit 1
fi

while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
  # Strip the CSV quoting and separators from every field.
  INDEX_URL=${INDEX_URL_PRE//\"/}
  INDEX_URL=${INDEX_URL//,/}
  CITY_ARCHIVE_NAME=${CITY_ARCHIVE_NAME_PRE//\"/}
  CITY_ARCHIVE_NAME=${CITY_ARCHIVE_NAME//,/}
  CALENDAR_NAME=${CALENDAR_NAME_PRE//\"/}
  CALENDAR_NAME=${CALENDAR_NAME//,/}

  # Blank lines in the CSV carry nothing to scrape.
  [[ -n "$INDEX_URL" ]] || continue

  INDEX_END="FALSE"
  attempt=0
  while [[ "$INDEX_END" == "FALSE" ]]; do
    attempt=$((attempt + 1))
    echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
    # Any non-zero wget status (network, SSL, server error, ...) is a failure,
    # not only status 8.
    if ! wget --no-check-certificate --user-agent="$WGET_UA" "$INDEX_URL" -O "$INDEX_PAGE" --show-progress; then
      INDEX_END="TRUE"
      echo "SCRAPE_ESCRIBE: Couldn't save index!"
      continue
    fi

    FOUNDLIST="FALSE"
    while IFS= read -r LINE; do
      if [[ "$FOUNDLIST" == "TRUE" ]]; then
        # The first line after the listbox that is not an <option> ends the list.
        if [[ "$LINE" != *'<option'[[:space:]]* ]]; then
          echo "SCRAPE_ESCRIBE: End of list."
          INDEX_END="TRUE"
          break
        fi

        MEETING_NAME=$(sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g' <<<"$LINE")
        echo "-========================================================================-"
        echo "- $MEETING_NAME"

        MEETING_DIR="${CITY_ARCHIVE_NAME}/Meetings (JSON)/${MEETING_NAME}"
        mkdir -p -- "$MEETING_DIR"

        # Pages start at 1; files are numbered from 0.
        x=1
        while true; do
          PAGE_FILE="${MEETING_DIR}/${MEETING_NAME}_$((x - 1)).json"
          fetch_past_meetings_page "$MEETING_NAME" "$x" "$PAGE_FILE"

          NUM_IN_JSON=$(jq '.d.Meetings | length' "$PAGE_FILE" 2> /dev/null)
          # A failed request or unparsable reply yields no count; treat it as
          # an empty page so the loop always terminates.
          [[ "$NUM_IN_JSON" =~ ^[0-9]+$ ]] || NUM_IN_JSON=0

          # A short page means we are on the final page.
          (( NUM_IN_JSON >= PAGE_SIZE )) || break
          x=$((x + 1))
        done
      fi

      if [[ "$LINE" == *'class="MeetingTypeListbox"'* ]]; then
        echo "SCRAPE_ESCRIBE: Found meeting type list."
        FOUNDLIST="TRUE"
      fi
    done < "$INDEX_PAGE"

    if [[ "$INDEX_END" == "FALSE" ]]; then
      if [[ "$FOUNDLIST" == "TRUE" ]]; then
        # The option list ran to the end of the file: everything was handled.
        INDEX_END="TRUE"
      elif (( attempt >= MAX_INDEX_ATTEMPTS )); then
        # Never retry forever against a page that has no listbox.
        echo "SCRAPE_ESCRIBE: Meeting type list not found after $attempt attempts, giving up."
        INDEX_END="TRUE"
      fi
    fi
  done
done < websites.csv
|
||||
127
SCRAPE_JOB.SH
Normal file
127
SCRAPE_JOB.SH
Normal file
@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env bash
|
||||
# Mark the beginning of a timed step.
# Globals written:
#   START_SECONDS  - start time as seconds since the epoch
#   START_READABLE - start time formatted as "YYYY-MM-DD HH:MM:SS"
start_timer() {
  START_SECONDS="$(date '+%s')"
  START_READABLE="$(date '+%F %T')"
}
|
||||
# Mark the end of a timed step and work out how long it took.
# Globals read:    START_SECONDS (set by start_timer)
# Globals written:
#   END_SECONDS      - end time as seconds since the epoch
#   END_READABLE     - end time formatted as "YYYY-MM-DD HH:MM:SS"
#   ELAPSED_SECONDS  - END_SECONDS minus START_SECONDS
#   ELAPSED_READABLE - elapsed time formatted as "HH:MM:SS"
end_timer() {
  local hours minutes seconds

  END_SECONDS="$(date '+%s')"
  END_READABLE="$(date '+%F %T')"
  ELAPSED_SECONDS=$((END_SECONDS - START_SECONDS))

  hours=$((ELAPSED_SECONDS / 3600))
  minutes=$(((ELAPSED_SECONDS % 3600) / 60))
  seconds=$((ELAPSED_SECONDS % 60))
  printf -v ELAPSED_READABLE '%02d:%02d:%02d' "$hours" "$minutes" "$seconds"
}
|
||||
# Append one table row describing a finished step to the HTML log index.
# Arguments:
#   $1 - path of the step's log file (only its base name is linked)
#   $2 - label shown for the step
# Globals read:    CRON_LOG_INDEX, START_READABLE, END_READABLE, ELAPSED_READABLE
# Globals written: PROCURL (base name of $1)
push_log() {
  local cell="<td bgcolor='#bababa' style='color:black;'>"

  # Quote the path so names containing spaces or glob characters survive.
  PROCURL=$(basename -- "$1")

  # The row is written as a single line; the link is now closed with </a>,
  # which the previous markup left open.
  printf '<tr> %s<a href="./%s">View</a></td> %s%s</td> %s%s</td> %s%s</td> %s%s</td> </tr>\n' \
    "$cell" "$PROCURL" \
    "$cell" "$2" \
    "$cell" "$START_READABLE" \
    "$cell" "$END_READABLE" \
    "$cell" "$ELAPSED_READABLE" >> "$CRON_LOG_INDEX"
}
|
||||
# Escape a string so it can be placed between double quotes in JSON.
#   $1 - raw text; the escaped text is written to stdout
_webhook_json_escape() {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

# Post a status embed to the Discord webhook.
# Arguments:
#   $1 - embed title
#   $2 - path of the related log file (its base name is linked)
#   $3 - embed description
#   $4 - readable start time
#   $5 - readable elapsed time
# Globals read:    WEBHOOK_URL (optional override of the target webhook)
# Globals written: PROCURL (base name of $2)
push_webhook() {
  # NOTE(review): the fallback below is a live webhook secret committed to the
  # repository. It is kept so existing deployments keep working, but it should
  # be rotated and supplied via the WEBHOOK_URL environment variable instead.
  local target_url="${WEBHOOK_URL:-https://discord.com/api/webhooks/1472056322886209600/8EtHDzTdVYuaU2mn0-fY6BZZwxW4ZMkNnGzFyTCJhcS6FMHYagjxeyw0rw9o5S-TNRRA}"
  local title link description footer payload

  PROCURL=$(basename "$2")

  # Every interpolated value is escaped so that quotes, backslashes or line
  # breaks in an argument can no longer corrupt the payload.
  title=$(_webhook_json_escape "$1")
  link=$(_webhook_json_escape "https://mystery-of-the-typical-tiny-gold-iguana.randommeaninglesscharacters.com/lalogs/$PROCURL")
  description=$(_webhook_json_escape "$3")
  footer=$(_webhook_json_escape "Start: $4, time elapsed: $5")

  printf -v payload \
    '{\n  "embeds": [{\n    "color": 19712,\n    "title": "%s",\n    "url": "%s",\n    "description": "%s",\n    "footer": {\n      "text": "%s"\n    }\n  }]\n}' \
    "$title" "$link" "$description" "$footer"

  curl \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$target_url"
}
|
||||
# ---------------------------------------------------------------------------
# Main job: run every scraper in turn, log each step to the public log
# directory, report progress to the webhook, then sync the archive to S3.
# ---------------------------------------------------------------------------

datestamp=$(date +'%Y%m%d')
export datestamp
export CRON_LOG_DIR="/var/www/mystery-of-the-typical-tiny-gold-iguana.randommeaninglesscharacters.com/lalogs"
export CRON_LOG_INDEX="${CRON_LOG_DIR}/index.html"
export CRON_LOG_SCRIPT="${CRON_LOG_DIR}/${datestamp}_script.txt"
export CRON_LOG_MEET="${CRON_LOG_DIR}/${datestamp}_meet.txt"
export CRON_LOG_PLAN="${CRON_LOG_DIR}/${datestamp}_plan.txt"
export CRON_LOG_JSON="${CRON_LOG_DIR}/${datestamp}_json.txt"
export CRON_LOG_LTC="${CRON_LOG_DIR}/${datestamp}_ltc.txt"
export CRON_LOG_LPS="${CRON_LOG_DIR}/${datestamp}_lps.txt"
export CRON_LOG_S3="${CRON_LOG_DIR}/${datestamp}_s3.txt"
export CRON_LOG_SITEMAP="${CRON_LOG_DIR}/${datestamp}_sitemap.txt"

# Run one timed step, appending its stdout to a log file, then record the
# step in the log index and announce it on the webhook.
#   $1 - log file, $2 - label for the log index, $3 - webhook title,
#   $4 - webhook description, $5... - command to run
run_step() {
  local log_file=$1 log_label=$2 hook_title=$3 hook_message=$4
  shift 4

  start_timer
  "$@" >> "$log_file"
  end_timer \
    && push_log "$log_file" "$log_label" \
    && push_webhook "$hook_title" "$log_file" "$hook_message" "$START_READABLE" "$ELAPSED_READABLE"
}

# -p: the log directory normally exists already on every run but the first.
mkdir -p "$CRON_LOG_DIR"

if [ ! -f "$CRON_LOG_INDEX" ]; then
  cp "./template/logdir.html" "$CRON_LOG_INDEX"
fi

# Separate timer for main job.
START_SECONDS_ALL=$(date +%s)
START_READABLE_ALL=$(date "+%Y-%m-%d %H:%M:%S")
push_webhook "Start scrape job" "$CRON_LOG_INDEX" "Starting London Archive scrapers." "$START_READABLE_ALL" "N/A"

mkdir -p \
  "./LondonArchive/Meetings" \
  "./LondonArchive/Meetings (JSON)" \
  "./LondonArchive/Planning Applications" \
  "./LondonArchive/LTC" \
  "./LondonArchive/LPS"

start_timer
# Back up scripts regularly.
mkdir -p "./SCRIPTS"
cp *.SH "./SCRIPTS/"
cp *.TXT "./SCRIPTS/"
# NOTE(review): the archive password is hard-coded and visible in the process
# list while 7z runs; consider reading it from a protected file or variable.
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "SCRIPTS.7z" "./SCRIPTS" -mhe=on
rm -r "./SCRIPTS"
mv "SCRIPTS.7z" "./LondonArchive/Log_${datestamp}.7z"
echo "This log is private. However, all other logs are public." >> "$CRON_LOG_SCRIPT"
end_timer && push_log "$CRON_LOG_SCRIPT" "BACK UP SCRIPT"

run_step "$CRON_LOG_MEET" "SCRAPE_MEET.SH" "SCRAPE_MEET.SH" \
  "Done processing city meetings." ./SCRAPE_MEET.SH

#start_timer
#./SCRAPE_PLAN.SH >> $CRON_LOG_PLAN
#end_timer && push_log "$CRON_LOG_PLAN" "SCRAPE_PLAN.SH" && push_webhook "SCRAPE_PLAN.SH" #"$CRON_LOG_PLAN" "Done processing planning applications." "$START_READABLE" "$ELAPSED_READABLE"

run_step "$CRON_LOG_JSON" "SCRAPE_JSON.SH" "SCRAPE_JSON.SH" \
  "Done backing up eScribe meeting lists." ./SCRAPE_ESCRIBE.SH

run_step "$CRON_LOG_LTC" "SCRAPE_LTC.SH" "SCRAPE_LTC.SH" \
  "Done processing LTC meetings." ./SCRAPE_LTC.SH

# The log index row for this step used to be labelled "SCRAPE_LTC.SH".
run_step "$CRON_LOG_LPS" "SCRAPE_LPS.SH" "SCRAPE_LPS.SH" \
  "Done processing LPS meetings." ./SCRAPE_LPS.SH

run_step "$CRON_LOG_S3" "AWS S3 SYNC" "AWS S3 SYNC" \
  "Done syncing files to S3." \
  aws s3 sync ./LondonArchive "s3://public-file-browser-files-0261cd08327d/" --profile london --no-progress --size-only

rm -rf "./LondonArchive"
mkdir "./LondonArchive"

# Make/upload sitemap AFTER clearing the work dir. Otherwise everything gets uploaded again.
run_step "$CRON_LOG_SITEMAP" "MAKE_SITEMAP.SH" "MAKE_SITEMAP.SH" \
  "Done updating archive sitemap, requested YaCy indexing." ./MAKE_SITEMAP.SH

rm -rf "./LondonArchive"

END_SECONDS_ALL=$(date +%s)
ELAPSED_SECONDS_ALL=$((END_SECONDS_ALL - START_SECONDS_ALL))
printf -v ELAPSED_READABLE_ALL '%02d:%02d:%02d' \
  "$((ELAPSED_SECONDS_ALL / 3600))" \
  "$(((ELAPSED_SECONDS_ALL % 3600) / 60))" \
  "$((ELAPSED_SECONDS_ALL % 60))"
push_webhook "Finished scrape job" "$CRON_LOG_INDEX" "Archive is now fully updated." "$START_READABLE_ALL" "$ELAPSED_READABLE_ALL"
|
||||
76
SCRAPE_LPS.SH
Normal file
76
SCRAPE_LPS.SH
Normal file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env bash
|
||||
# Start-up banner. %b expands the leading "\n" the same way `echo -e` does.
printf '%b\n' \
  "\n-========================================================================-" \
  "-=- -=-" \
  "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-" \
  "-=- -=-" \
  "-=- https://gist.github.com/rvtr/******************************** -=-" \
  "-=- Lillian Skinner -=-" \
  "-=- -=-" \
  "-========================================================================-"
|
||||
|
||||
# Split a date written as "Month D, YYYY" into numeric parts.
# Arguments:
#   $1 - date text, e.g. "August 08, 2024" (trailing text is ignored)
# Outputs:
#   echoes $1 unchanged to stdout (kept for the log)
# Globals written:
#   MEETING_MONTH_WORD - leading month word as found in $1
#   MEETING_DAY_SHORT  - day digits as found in $1 (empty if not found)
#   MEETING_DAY        - two-digit day, "00" if not found
#   MEETING_MONTH      - two-digit month, "--" if the word is not a month
#   MEETING_YEAR       - year digits, "0000" if not found
conv_date() {
  local raw=$1
  local word_re='^[[:space:]]*([A-Za-z]+)'
  local day_re='^[A-Za-z]+ ([0-9]+),'
  local year_re='^[A-Za-z]+ [0-9]+, ([0-9]+)'

  echo "$raw"

  MEETING_MONTH_WORD=""
  MEETING_DAY_SHORT=""
  MEETING_DAY="00"
  MEETING_YEAR="0000"

  if [[ "$raw" =~ $word_re ]]; then
    MEETING_MONTH_WORD=${BASH_REMATCH[1]}
  fi
  if [[ "$raw" =~ $day_re ]]; then
    MEETING_DAY_SHORT=${BASH_REMATCH[1]}
    # Force base 10: a plain "%02d" reads "08" and "09" as invalid octal.
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  fi
  if [[ "$raw" =~ $year_re ]]; then
    MEETING_YEAR=${BASH_REMATCH[1]}
  fi

  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main: walk the LPS board meetings page and download every linked PDF into
#   ./LondonArchive/LPS/Board/<year>/<month>-<day>/<Agenda|Minutes>.pdf
# ---------------------------------------------------------------------------

# London seems to have recently blocked unusual user agents, so requests
# present themselves as a regular desktop browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

LPS_INDEX_URL="https://londonpoliceserviceboard.com/board-meetings/"
LPS_INDEX_PAGE="./tmp/index.html"

# -p: the parent job usually creates these directories beforehand.
mkdir -p "./LondonArchive/LPS" "./tmp"

if ! wget --user-agent="$WGET_UA" "$LPS_INDEX_URL" -O "$LPS_INDEX_PAGE" -q; then
  echo "SCRAPE_LPS: Couldn't download the board meetings index." >&2
  exit 1
fi

# Defaults so the date parts are always defined.
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
# Document kind of the section being read; empty until a known heading is seen.
ATTACH_TYPE=""

while IFS= read -r LINE_PRE; do
  # Turn every non-breaking space into a plain one, then trim and collapse
  # whitespace by word-splitting with read (no glob expansion happens here).
  read -r -a LINE_WORDS <<<"${LINE_PRE//$'\xC2\xA0'/ }"
  LINE="${LINE_WORDS[*]}"

  FOUND_ATTACH_TYPE=$(grep '<h3 class="fusion-title-heading' <<<"$LINE" \
    | sed 's/.*<h3[^>]*>\([^<]*\)<[\/:-]h3>.*/\1/')
  FOUND_YEAR_HEADER=$(grep 'tabindex="0" aria-labelledby="fusion-tab-' <<<"$LINE" \
    | sed -e 's/.*aria-labelledby="\([^"]*\)".*/\1/' -e 's/.*fusion-tab-//')

  if [[ -n "$FOUND_YEAR_HEADER" ]]; then
    echo "$FOUND_ATTACH_TYPE"
    echo "$FOUND_YEAR_HEADER"
    # An unrecognised heading leaves the previous document kind in effect.
    if [[ "$FOUND_ATTACH_TYPE" == "Meeting Minutes" ]]; then
      ATTACH_TYPE="Minutes"
    elif [[ "$FOUND_ATTACH_TYPE" == "Agenda and Report Packages" ]]; then
      ATTACH_TYPE="Agenda"
    fi
  fi

  FOUND_LINK=$(grep 'a href="' <<<"$LINE" | grep -F '.pdf' | grep '<td valign="top">')
  if [[ -n "$ATTACH_TYPE" && -n "$FOUND_LINK" ]]; then
    # The link text holds the meeting date; cut it off after the 4-digit year.
    LINK_TEXT=$(sed -e 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' \
      -e 's/\([0-9]\{4\}\).*/\1/' \
      -e 's/^[[:space:]]*//; s/[[:space:]]*$//' <<<"$FOUND_LINK")
    PDF_URL=$(sed 's/.*href="\([^"]*\)".*/\1/' <<<"$FOUND_LINK")

    conv_date "$LINK_TEXT"
    echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
    echo "$PDF_URL"

    MEETING_DIR="./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
    mkdir -p "$MEETING_DIR"
    # Send the same user agent as for the index page.
    wget --user-agent="$WGET_UA" "$PDF_URL" -O "$MEETING_DIR/$ATTACH_TYPE.pdf" -q
  fi
done < "$LPS_INDEX_PAGE"
|
||||
184
SCRAPE_LTC.SH
Normal file
184
SCRAPE_LTC.SH
Normal file
@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env bash
#
# SCRAPE_LTC.SH: download LTC committee agendas and minutes.
#
# Walks the LTC "agendas and minutes" page line by line. For each recent
# meeting it saves the PDF agenda/minutes, or, for HTML agendas/minutes,
# crawls the linked article and saves every attachment (PDFs as-is, HTML
# attachments rendered to PDF with wkhtmltopdf) under
#   ./LondonArchive/LTC/<committee>/<year>/<month>-<day>/
# Needs: wget, sed, grep, perl, wkhtmltopdf, ./template/default.html.

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LTC.SH: Downloads LTC committee agendas and minutes -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

# London seems to have recently blocked unusual user agents, so requests
# present themselves as a regular desktop browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

LTC_INDEX_URL="https://www.londontransit.ca/agendas-and-minutes/"
LTC_INDEX_PAGE="./tmp/index.html"
LTC_WORK_PAGE="./tmp/work.html"
LTC_ATTACH_PAGE="./tmp/attachment.html"
LTC_RENDER_PAGE="./tmp/new.html"

# Replace non-breaking spaces, trim, and collapse runs of whitespace.
#   $1 - raw line; the cleaned line is written to stdout
normalize_line() {
  local -a words
  read -r -a words <<<"${1//$'\xC2\xA0'/ }"
  printf '%s' "${words[*]}"
}

# Walk a downloaded HTML attachment, copy its <article> into a fresh page
# based on the default template, fetch PDFs it references, and render the
# page to PDF when the article ends.
#   $1 - attachments directory, $2 - title used for the rendered PDF name
crawl_attachment_page() {
  local attach_dir=$1 title=$2
  local raw line pdf_link ref_name in_article=""

  while IFS= read -r raw; do
    line=$(normalize_line "$raw")
    pdf_link=$(grep "\.pdf" <<<"$line" \
      | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/')

    if [[ "$line" == *"<article"* ]]; then
      echo " FOUND ATTACHMENT ARTICLE START"
      # CSS for the HTML is in the default template
      cat ./template/default.html > "$LTC_RENDER_PAGE"
      echo "$line" >> "$LTC_RENDER_PAGE"
      in_article="TRUE"
    elif [[ "$line" == *"</article>"* ]]; then
      echo " END OF ATTACHMENT ARTICLE"
      echo "$line" >> "$LTC_RENDER_PAGE"
      echo " PROCESSED TO PDF"
      wkhtmltopdf "$LTC_RENDER_PAGE" "$attach_dir/$title.pdf" 2> /dev/null
      in_article=""
    elif [[ -n "$pdf_link" && -n "$in_article" ]]; then
      ref_name=${pdf_link##*/}
      echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
      echo " $pdf_link"
      wget --user-agent="$WGET_UA" "$pdf_link" -O "$attach_dir/$ref_name" -c -q
      # The rendered page lists the file name in place of the original link.
      echo "<ul><li>$ref_name</li></ul>" >> "$LTC_RENDER_PAGE"
    elif [[ -n "$in_article" ]]; then
      echo "$line" >> "$LTC_RENDER_PAGE"
    fi
  done < "$LTC_ATTACH_PAGE"
}

# Walk a downloaded HTML agenda/minutes page and save every attachment
# linked from inside its <article>.
#   $1 - meeting directory
crawl_meeting_page() {
  local attach_dir="$1/Attachments"
  local raw line link title in_article=""

  while IFS= read -r raw; do
    line=$(normalize_line "$raw")
    link=$(grep "<a href" <<<"$line" \
      | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')

    if [[ "$line" == *"<article"* ]]; then
      echo " FOUND INDEX ARTICLE START"
      in_article="TRUE"
    elif [[ "$line" == *"</article>"* ]]; then
      echo " END OF INDEX ARTICLE"
      in_article=""
    elif [[ -n "$link" && -n "$in_article" ]]; then
      mkdir -p "$attach_dir"
      if [[ "$link" == *.pdf* ]]; then
        echo " DOWNLOAD ATTACHMENT PDF"
        echo " $link"
        wget --user-agent="$WGET_UA" "$link" -O "$attach_dir/${link##*/}" -c -q
      else
        # Extract title of attachment: drop <sup> tags, take the link text,
        # spell out ampersands, drop other entities and dash characters, and
        # squeeze repeated spaces.
        title=$(sed -e 's/<sup>//g' -e 's/<\/sup>//g' <<<"$line" \
          | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' \
          | sed -e 's/&amp;/and/g' -e 's/&.....;./ /g' -e 's/&/and/g' \
          | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' \
          | sed 's/  */ /g')
        # The title becomes a file name: no path separators, never empty.
        title=${title//\//-}
        [[ -n "$title" ]] || title="Attachment"

        echo " DOWNLOAD ATTACHMENT HTML"
        echo " $title"
        echo " $link"
        if wget --user-agent="$WGET_UA" "$link" -O "$LTC_ATTACH_PAGE" -q; then
          crawl_attachment_page "$attach_dir" "$title"
        else
          # Do not parse a stale page left over from an earlier attachment.
          echo " COULD NOT DOWNLOAD ATTACHMENT HTML" >&2
        fi
      fi
    fi
  done < "$LTC_WORK_PAGE"
}

# -p: the parent job usually creates these directories beforehand.
mkdir -p "./LondonArchive/LTC" "./tmp"

if ! wget --user-agent="$WGET_UA" "$LTC_INDEX_URL" -O "$LTC_INDEX_PAGE" -q; then
  echo "SCRAPE_LTC: Couldn't download the agendas and minutes index." >&2
  exit 1
fi

current_year=$(date +%Y)
current_month=$(date +%m)

# Defaults so the date comparison below always sees numbers.
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"
COMMITTEENAME=""
COMMITTEENAME_SLUG=""

month_word_re='^([A-Za-z]+)'
day_re='^[A-Za-z]+ ([0-9]+),'
year_re='^[A-Za-z]+ [0-9]+, ([0-9]+)'
number_re='^[0-9]+$'

while IFS= read -r LINE_PRE; do
  LINE=$(normalize_line "$LINE_PRE")

  if [[ "$LINE" == *"<h2>"* && "$LINE" == *"Agendas and Minutes"* ]]; then
    # Only <h2> without a class is the title of a committee; "Agendas and
    # Minutes" in the same line confirms it. This line is only a marker for a
    # new committee, nothing is downloaded for it.
    COMMITTEENAME=${LINE/"<h2>"/}
    COMMITTEENAME=${COMMITTEENAME/"</h2>"/}
    echo "NEW COMMITTEE"
    echo "$COMMITTEENAME"
    # Start from a neutral folder so an unrecognised committee never inherits
    # the folder of the committee before it.
    COMMITTEENAME_SLUG="Other"
    if [[ "$COMMITTEENAME" == *APTSAC* ]]; then
      echo "Committee slug set"
      COMMITTEENAME_SLUG="Accessible Public Transit Services Advisory Committee"
    elif [[ "$COMMITTEENAME" == *Commission* ]]; then
      echo "Committee slug set"
      COMMITTEENAME_SLUG="Commission"
    fi

  elif [[ "$LINE" == *"</strong></td>"* ]]; then
    # Remove HTML junk from date string.
    DATES_CLEAN=$(sed -e 's/.*<strong>//' -e 's/<\/strong>.*//' -e 's/<span.*//' \
      -e 's/[[:space:]]*$//' -e 's/\.//' <<<"$LINE")

    MEETING_MONTH_WORD=""
    MEETING_DAY="00"
    MEETING_YEAR="0000"
    if [[ "$DATES_CLEAN" =~ $month_word_re ]]; then
      MEETING_MONTH_WORD=${BASH_REMATCH[1]}
    fi
    if [[ "$DATES_CLEAN" =~ $day_re ]]; then
      # Force base 10 so "08" and "09" are not read as octal.
      printf -v MEETING_DAY '%02d' "$((10#${BASH_REMATCH[1]}))"
    fi
    if [[ "$DATES_CLEAN" =~ $year_re ]]; then
      MEETING_YEAR=${BASH_REMATCH[1]}
    fi

    case "$MEETING_MONTH_WORD" in
      Jan*) MEETING_MONTH="01" ;;
      Feb*) MEETING_MONTH="02" ;;
      Mar*) MEETING_MONTH="03" ;;
      Apr*) MEETING_MONTH="04" ;;
      May) MEETING_MONTH="05" ;;
      Jun*) MEETING_MONTH="06" ;;
      Jul*) MEETING_MONTH="07" ;;
      Aug*) MEETING_MONTH="08" ;;
      Sep*) MEETING_MONTH="09" ;;
      Oct*) MEETING_MONTH="10" ;;
      Nov*) MEETING_MONTH="11" ;;
      Dec*) MEETING_MONTH="12" ;;
      *) MEETING_MONTH="--" ;;
    esac
    echo " NEW MEETING FOUND"
    echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"

  else
    # Has a committee been set? What about a usable date? Only meetings from
    # the previous month onwards are fetched (remove the date comparison to
    # download the full page). Months are compared as year*12+month so the
    # window also works across a year boundary.
    [[ -n "$COMMITTEENAME" ]] || continue
    [[ "$MEETING_YEAR" =~ $number_re && "$MEETING_MONTH" =~ $number_re ]] || continue
    (( 10#$MEETING_YEAR * 12 + 10#$MEETING_MONTH >= 10#$current_year * 12 + 10#$current_month - 1 )) || continue

    AGENDAURL=$(grep "PDF Agenda" <<<"$LINE" | grep "\.pdf" \
      | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/')
    AGENDAHTMLURL=$(grep "HTML Agenda" <<<"$LINE" \
      | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
    MINUTESURL=$(grep "PDF Minutes" <<<"$LINE" | grep "\.pdf" \
      | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/.pdf.*/.pdf/')
    MINUTESHTMLURL=$(grep "HTML Minutes" <<<"$LINE" \
      | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')

    [[ -n "$AGENDAURL$MINUTESURL$AGENDAHTMLURL$MINUTESHTMLURL" ]] || continue

    MEETING_DIR="./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
    mkdir -p "$MEETING_DIR"

    if [[ -n "$AGENDAURL" ]]; then
      echo " DOWNLOAD AGENDA PDF"
      echo " $AGENDAURL"
      wget --user-agent="$WGET_UA" "$AGENDAURL" -O "$MEETING_DIR/Agenda.pdf" -c -q
    elif [[ -n "$MINUTESURL" ]]; then
      echo " DOWNLOAD MINUTES PDF"
      echo " $MINUTESURL"
      wget --user-agent="$WGET_UA" "$MINUTESURL" -O "$MEETING_DIR/Minutes.pdf" -c -q
    else
      if [[ -n "$AGENDAHTMLURL" ]]; then
        echo " DOWNLOAD AGENDA HTML TO CRAWL"
        PAGE_URL=$AGENDAHTMLURL
      else
        echo " DOWNLOAD MINUTES HTML TO CRAWL"
        PAGE_URL=$MINUTESHTMLURL
      fi
      echo " $PAGE_URL"
      if wget --user-agent="$WGET_UA" "$PAGE_URL" -O "$LTC_WORK_PAGE" -q; then
        crawl_meeting_page "$MEETING_DIR"
      else
        # Do not crawl a stale page left over from an earlier meeting.
        echo " COULD NOT DOWNLOAD HTML TO CRAWL" >&2
      fi
    fi
  fi
done < "$LTC_INDEX_PAGE"
|
||||
423
SCRAPE_MEET.SH
Normal file
423
SCRAPE_MEET.SH
Normal file
@ -0,0 +1,423 @@
|
||||
#!/usr/bin/env bash
|
||||
# Start-up banner. %b expands the leading "\n" the same way `echo -e` does.
printf '%b\n' \
  "\n-========================================================================-" \
  "-=- -=-" \
  "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-" \
  "-=- -=-" \
  "-=- Lillian Skinner -=-" \
  "-=- -=-" \
  "-========================================================================-"
|
||||
|
||||
# Split a date written as "Month D, YYYY" into numeric parts.
# Arguments:
#   $1 - date text, e.g. "August 08, 2024" (trailing text is ignored)
# Outputs:
#   echoes $1 unchanged to stdout (kept for the log)
# Globals written:
#   MEETING_MONTH_WORD - leading month word as found in $1
#   MEETING_DAY_SHORT  - day digits as found in $1 (empty if not found)
#   MEETING_DAY        - two-digit day, "00" if not found
#   MEETING_MONTH      - two-digit month, "--" if the word is not a month
#   MEETING_YEAR       - year digits, "0000" if not found
conv_date() {
  local raw=$1
  local word_re='^[[:space:]]*([A-Za-z]+)'
  local day_re='^[A-Za-z]+ ([0-9]+),'
  local year_re='^[A-Za-z]+ [0-9]+, ([0-9]+)'

  echo "$raw"

  MEETING_MONTH_WORD=""
  MEETING_DAY_SHORT=""
  MEETING_DAY="00"
  MEETING_YEAR="0000"

  if [[ "$raw" =~ $word_re ]]; then
    MEETING_MONTH_WORD=${BASH_REMATCH[1]}
  fi
  if [[ "$raw" =~ $day_re ]]; then
    MEETING_DAY_SHORT=${BASH_REMATCH[1]}
    # Force base 10: a plain "%02d" reads "08" and "09" as invalid octal.
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  fi
  if [[ "$raw" =~ $year_re ]]; then
    MEETING_YEAR=${BASH_REMATCH[1]}
  fi

  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
|
||||
|
||||
# Split a date written as "D Month YYYY" into numeric parts.
# Arguments:
#   $1 - date text, e.g. "08 August 2024"
# Outputs:
#   echoes $1 unchanged to stdout (kept for the log)
# Globals written:
#   MEETING_DAY_SHORT  - first space-separated word of $1
#   MEETING_MONTH_WORD - second space-separated word of $1
#   MEETING_YEAR       - text after the last space of $1
#   MEETING_DAY        - two-digit day, "00" if the first word has no
#                        leading digits
#   MEETING_MONTH      - two-digit month, "--" if the word is not a month
conv_date_alt() {
  local raw=$1
  local after_day
  local digits_re='^([0-9]+)'

  echo "$raw"

  MEETING_DAY_SHORT=${raw%% *}
  after_day=${raw#* }
  MEETING_MONTH_WORD=${after_day%% *}
  MEETING_YEAR=${raw##* }

  MEETING_DAY="00"
  if [[ "$MEETING_DAY_SHORT" =~ $digits_re ]]; then
    # Force base 10: a plain "%02d" reads "08" and "09" as invalid octal.
    printf -v MEETING_DAY '%02d' "$((10#${BASH_REMATCH[1]}))"
  fi

  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
|
||||
|
||||
# Remember the URL of one meeting document under the variable that matches
# its label. Unknown labels are ignored.
# Arguments:
#   $1 - document label including its surrounding double quotes,
#        e.g. '"Agenda (PDF)"'
#   $2 - document URL; every double quote in it is removed
# Globals written: exactly one of the *_URL variables listed below.
set_agenda_url() {
  # Strip the quotes in-shell. The former unquoted `echo $2 | sed` exposed
  # the URL to word splitting and glob expansion ("?" and "*" are glob
  # characters) and spawned two processes per call.
  local url=${2//\"/}

  case "$1" in
    '"Agenda (HTML)"') AGENDA_HTML_URL=$url ;;
    '"Agenda (PDF)"') AGENDA_PDF_URL=$url ;;
    '"Revised Agenda (HTML)"') AGENDA_REVISE_HTML_URL=$url ;;
    '"Revised Agenda (PDF)"') AGENDA_REVISE_PDF_URL=$url ;;
    '"Minutes (HTML)"') MINUTES_HTML_URL=$url ;;
    '"Minutes (PDF)"') MINUTES_PDF_URL=$url ;;
    '"Minutes with Attachments (PDF)"') MINUTES_ATTACH_PDF_URL=$url ;;

    '"Agenda Full Package (HTML)"') AGENDA_FULL_HTML_URL=$url ;;
    '"Agenda Full Package (PDF)"') AGENDA_FULL_PDF_URL=$url ;;
    '"Agenda Cover Page (HTML)"') AGENDA_COVER_HTML_URL=$url ;;
    '"Agenda Cover Page (PDF)"') AGENDA_COVER_PDF_URL=$url ;;
    '"Post Agenda (HTML)"') AGENDA_POST_HTML_URL=$url ;;
    '"Post Agenda (PDF)"') AGENDA_POST_PDF_URL=$url ;;
    '"Addendum (HTML)"') ADDENDUM_HTML_URL=$url ;;
    '"Addendum (PDF)"') ADDENDUM_PDF_URL=$url ;;
  esac
}
|
||||
|
||||
# Forget every document URL remembered by set_agenda_url, so values from one
# meeting cannot carry over to the next.
# Globals written: all fifteen *_URL variables, each set to the empty string.
clear_agenda_url() {
  local url_var

  for url_var in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$url_var" '%s' ""
  done
}
|
||||
|
||||
# Announce and download a single meeting document; does nothing when the URL
# is empty.
#   $1 - URL, $2 - destination file, $3 - status line to print
# Globals read: WGET_UA
_download_agenda_doc() {
  local url=$1 dest_file=$2 message=$3

  [[ -n "$url" ]] || return 0
  echo "$message"
  wget --no-check-certificate --user-agent="$WGET_UA" "$url" -O "$dest_file" -N -q #--show-progress
}

# Download the documents of one meeting into a directory, using the URLs
# remembered by set_agenda_url.
#
# Agenda: only the first available family is saved, in this order of
# preference: PDF (regular/revised), HTML (regular/revised), full package,
# post agenda. Minutes: PDFs if any exist, otherwise the HTML version.
# Cover pages and addenda are always saved when present.
#
# Arguments:
#   $1 - destination directory (must already exist)
# Globals read: WGET_UA and all *_URL variables set by set_agenda_url
download_agendas() {
  local dest=$1

  if [[ -n "$AGENDA_REVISE_PDF_URL" || -n "$AGENDA_PDF_URL" ]]; then
    _download_agenda_doc "$AGENDA_REVISE_PDF_URL" "$dest/Agenda_Revised.pdf" \
      "Saving revised agenda as PDF..."
    _download_agenda_doc "$AGENDA_PDF_URL" "$dest/Agenda.pdf" \
      "Saving regular agenda as PDF..."
  elif [[ -n "$AGENDA_REVISE_HTML_URL" || -n "$AGENDA_HTML_URL" ]]; then
    _download_agenda_doc "$AGENDA_REVISE_HTML_URL" "$dest/Agenda_Revised.html" \
      "Saving revised agenda as HTML... (no PDF found!)"
    _download_agenda_doc "$AGENDA_HTML_URL" "$dest/Agenda.html" \
      "Saving regular agenda as HTML... (no PDF found!)"
  elif [[ -n "$AGENDA_FULL_PDF_URL" || -n "$AGENDA_FULL_HTML_URL" ]]; then
    _download_agenda_doc "$AGENDA_FULL_PDF_URL" "$dest/Agenda_FullPackage.pdf" \
      "Saving full package agenda as PDF... (no HTML found!)"
    _download_agenda_doc "$AGENDA_FULL_HTML_URL" "$dest/Agenda_FullPackage.html" \
      "Saving full package agenda as HTML... (no PDF found!)"
  elif [[ -n "$AGENDA_POST_PDF_URL" || -n "$AGENDA_POST_HTML_URL" ]]; then
    # This status line used to say "as HTML" although a PDF is saved.
    _download_agenda_doc "$AGENDA_POST_PDF_URL" "$dest/Agenda_Post.pdf" \
      "Saving post agenda as PDF... (no HTML found!)"
    _download_agenda_doc "$AGENDA_POST_HTML_URL" "$dest/Agenda_Post.html" \
      "Saving post agenda as HTML... (no PDF found!)"
  fi

  if [[ -n "$MINUTES_ATTACH_PDF_URL" || -n "$MINUTES_PDF_URL" ]]; then
    _download_agenda_doc "$MINUTES_ATTACH_PDF_URL" "$dest/Minutes_With_Attachments.pdf" \
      "Saving minutes with attachments as PDF..."
    _download_agenda_doc "$MINUTES_PDF_URL" "$dest/Minutes.pdf" \
      "Saving minutes as PDF..."
  else
    _download_agenda_doc "$MINUTES_HTML_URL" "$dest/Minutes.html" \
      "Saving minutes as HTML... (no PDF found!)"
  fi

  _download_agenda_doc "$AGENDA_COVER_PDF_URL" "$dest/Agenda_Cover.pdf" \
    "Saving cover agenda as PDF... (no HTML found!)"
  _download_agenda_doc "$AGENDA_COVER_HTML_URL" "$dest/Agenda_Cover.html" \
    "Saving cover agenda as HTML... (no PDF found!)"
  _download_agenda_doc "$ADDENDUM_PDF_URL" "$dest/Addendum.pdf" \
    "Saving addendum as PDF... (no HTML found!)"
  _download_agenda_doc "$ADDENDUM_HTML_URL" "$dest/Addendum.html" \
    "Saving addendum as HTML... (no PDF found!)"
}
|
||||
|
||||
# Warning to all who read this script:
|
||||
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
||||
|
||||
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||
|
||||
TEMP_DIR="./tmp/"
|
||||
INDEX_PAGE="./tmp/index.html"
|
||||
SEARCH_PAGE="./tmp/search.html"
|
||||
AGENDA_HTML="./tmp/work.html"
|
||||
ADDENDUM_HTML="./tmp/addendum.html"
|
||||
#VIDEO_TIMESTAMP_JSON="./tmp/time.json"
|
||||
|
||||
# Today's date, split into zero-padded components (e.g. 2026 / 02 / 25).
# Year and month feed the "is this meeting recent?" check inside the main loop.
current_year=$(date +%Y)
current_month=$(date +%m)
# Plain two-digit day of the month. A stray "00" used to be glued onto the
# end of this value, turning e.g. "25" into "2500".
current_day=$(date +%d)
|
||||
|
||||
SUPPORT_PAST="FALSE"
|
||||
|
||||
if [ -d "$TEMP_DIR" ]; then
|
||||
rm -r $TEMP_DIR
|
||||
fi
|
||||
rm -f $INDEX_PAGE
|
||||
rm -f $SEARCH_PAGE
|
||||
rm -f $AGENDA_HTML
|
||||
|
||||
mkdir $TEMP_DIR
|
||||
|
||||
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
|
||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
|
||||
INDEX_END="FALSE"
|
||||
while [[ $INDEX_END == "FALSE" ]]; do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST="FALSE"
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END="TRUE"
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
echo "-========================================================================-"
|
||||
echo "- $MEETING_NAME"
|
||||
|
||||
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
||||
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
||||
echo "- Corrected to: $MEETING_NAME"
|
||||
fi
|
||||
# Pages start at 1. Ew.
|
||||
x=1
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
#cat "${TEMP_DIR}escribe.json" > debug.json
|
||||
|
||||
y=0
|
||||
i=0
|
||||
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
||||
while (true); do
|
||||
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
|
||||
|
||||
if [[ "$NUM_IN_JSON" == "" ]]; then
|
||||
break
|
||||
fi
|
||||
|
||||
# Decrease in the meeting count == we're on the final page.
|
||||
# The script treats 50 entries as a full page. Once every meeting of a full
# page has been handled, fetch the next page and restart the loop so that
# NUM_IN_JSON is re-read from the freshly downloaded file. Without the
# restart, an empty follow-up page (total count is an exact multiple of 50)
# would be processed once as a bogus "null" meeting.
if (( i >= 10#$NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50 )); then
  ((x++))
  i=0
  curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
  continue
elif (( i >= 10#$NUM_IN_JSON )); then
  # A short (final) page has been fully processed: nothing left to fetch.
  break
fi
|
||||
|
||||
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
||||
|
||||
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
||||
# No need to cat the entire file every time.
|
||||
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
|
||||
|
||||
#echo "> Meeting ID"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
||||
#echo "> Meeting Attachments"
|
||||
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
||||
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
||||
|
||||
clear_agenda_url
|
||||
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
|
||||
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
||||
done
|
||||
|
||||
# "25 Feb 2026"
|
||||
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Alternate date format."
|
||||
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
# "Feb 25 2026"
|
||||
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Standard date format."
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
else
|
||||
echo "COULD NOT FIGURE OUT DATE FORMAT!"
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
fi
|
||||
|
||||
# Decide whether this meeting is recent enough to report and download.
# "Recent" means last month or later (upcoming meetings included).
# Year and month are folded into one month index (year * 12 + month) so the
# comparison also works across year boundaries: comparing the two fields
# separately rejected December meetings in January and rejected early months
# of a future year late in the current one.
# Non-numeric values (e.g. the "--" month left by a failed date conversion)
# are treated as "in the past". The 10# prefix stops "08"/"09" being read as octal.
INPAST=""
if [[ "$MEETING_YEAR" =~ ^[0-9]+$ && "$MEETING_MONTH" =~ ^[0-9]+$ ]] \
  && (( 10#$MEETING_YEAR * 12 + 10#$MEETING_MONTH >= 10#$current_year * 12 + 10#$current_month - 1 )); then
  echo "NAME : $MEETING_NAME"
  echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
  echo "A (H) : $AGENDA_HTML_URL"
  echo "A (P) : $AGENDA_PDF_URL"
  echo "AR(H) : $AGENDA_REVISE_HTML_URL"
  echo "AR(P) : $AGENDA_REVISE_PDF_URL"
  echo "AF(H) : $AGENDA_FULL_HTML_URL"
  echo "AF(P) : $AGENDA_FULL_PDF_URL"
  echo "AC(H) : $AGENDA_COVER_HTML_URL"
  echo "AC(P) : $AGENDA_COVER_PDF_URL"
  echo "AP(H) : $AGENDA_POST_HTML_URL"
  echo "AP(P) : $AGENDA_POST_PDF_URL"
  echo "M (H) : $MINUTES_HTML_URL"
  echo "M (P) : $MINUTES_PDF_URL"
  echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
  echo "AD(H) : $ADDENDUM_HTML_URL"
  echo "AD(P) : $ADDENDUM_PDF_URL"
else
  echo "Dates are in the past!"
  echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
  INPAST="TRUE"
fi
|
||||
|
||||
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
||||
echo "Abort."
|
||||
break
|
||||
fi
|
||||
|
||||
#echo "> Meeting Video"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||
|
||||
ERROR="FALSE"
|
||||
ADDENDUM_ERROR="FALSE"
|
||||
echo "Downloading agenda HTML..."
|
||||
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_FULL_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_POST_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
else
|
||||
ERROR="TRUE"
|
||||
fi
|
||||
|
||||
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
|
||||
else
|
||||
ADDENDUM_ERROR="TRUE"
|
||||
fi
|
||||
|
||||
if [[ "$ERROR" == "FALSE" ]]; then
|
||||
|
||||
# Build the archive folder for this meeting:
#   ./<city archive>/Meetings/<meeting type>/<year>/<month>-<day>/Attachments
# mkdir -p creates every missing level in one call and stays quiet when the
# folders already exist (plain mkdir printed "File exists" errors for every
# meeting after the first one).
# The path is assembled by direct interpolation rather than by using the city
# name inside a printf format string, where a "%" or "\" would be misread.
MEETING_DIR="./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
mkdir -p -- "$MEETING_DIR/Attachments"
|
||||
|
||||
if [[ $VIDEO_URL != "" ]]; then
|
||||
echo "Saving recording URL..."
|
||||
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
|
||||
fi
|
||||
|
||||
# Get attachment links
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
||||
# Get attachment links
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
fi
|
||||
# Download attachment and use the name grabbed above
|
||||
echo "Found the following agenda attachments:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
echo "- $LINEA2"
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -N -q #--show-progress
|
||||
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
|
||||
echo "All attachments saved."
|
||||
|
||||
download_agendas "$MEETING_DIR"
|
||||
|
||||
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
||||
echo "dir not empty" >> /dev/null
|
||||
else
|
||||
rm -r "$MEETING_DIR/Attachments"
|
||||
fi
|
||||
|
||||
echo "All files from this meeting have been saved."
|
||||
fi
|
||||
|
||||
((i++))
|
||||
((y++))
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST="TRUE"
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END="TRUE"
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
done < websites.csv
|
||||
|
||||
|
||||
98
SCRAPE_OPEN.SH
Normal file
98
SCRAPE_OPEN.SH
Normal file
@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env bash
|
||||
echo -e "\n-========================================================================-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- SCRAPE_OPENDATA.SH: Scrape Open Data from the City of London -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- https://gist.github.com/rvtr/******************************** -=-"
|
||||
echo -e "-=- Lillian Skinner -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
# Working locations:
#   WORKDIR  - scratch space for the downloaded ArcGIS index JSON
#   STAGEDIR - per-item staging folder (wiped and recreated for every item)
#   DOCDIR   - destination for archived documents
#   MAPDIR   - destination for archived map exports
WORKDIR="./tmp"
STAGEDIR="./staging"
DOCDIR="./LondonArchive_OpenData"
MAPDIR="./LondonArchive_OpenData/Maps"
# Browser-like user agent handed to wget for every download.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# -p keeps reruns quiet: plain mkdir reported an error for every folder that
# was left over from a previous run.
mkdir -p -- "$WORKDIR" "$DOCDIR" "$MAPDIR"
|
||||
|
||||
i=0
|
||||
SEARCH_END=0
|
||||
while [[ $SEARCH_END == 0 ]]; do
|
||||
if ((i == 0)); then
|
||||
OFFSET=""
|
||||
else
|
||||
OFFSET="startindex=$((i * 100))"
|
||||
fi
|
||||
|
||||
echo "Start index download..."
|
||||
curl --get \
|
||||
--data-urlencode "filter=((group IN (de724381536540a5bf2d005fb32ec92a, d17e3e9bfd274e88aeed15fa165bf1e3, b7ab05d332c24dd2ba485acd2ac92837, b15cf62bc0a14990a75e348930b0cb4e)))" \
|
||||
--data-urlencode "limit=100" \
|
||||
--data-urlencode "$OFFSET" \
|
||||
"https://hub.arcgis.com/api/search/v1/collections/all/items" \
|
||||
| jq > $WORKDIR/arcgis_list.json
|
||||
|
||||
TOTAL_ITEMS=$(jq .numberMatched $WORKDIR/arcgis_list.json)
|
||||
RETURNED_ITEMS=$(jq .numberReturned $WORKDIR/arcgis_list.json)
|
||||
echo "Total items in JSON : $TOTAL_ITEMS"
|
||||
echo "Returned items : $RETURNED_ITEMS"
|
||||
|
||||
for (( j=0; j<=$((RETURNED_ITEMS - 1)); j++ )); do
|
||||
ITEM_ID=$(jq .features[$j]\ .id $WORKDIR/arcgis_list.json | sed 's/\"//g')
|
||||
ITEM_TITLE=$(jq .features[$j]\ .properties\ .title $WORKDIR/arcgis_list.json | sed 's/\"//g')
|
||||
ITEM_URL=$(jq .features[$j]\ .properties\ .url $WORKDIR/arcgis_list.json | sed 's/\"//g')
|
||||
ITEM_NAME=$(jq .features[$j]\ .properties\ .name $WORKDIR/arcgis_list.json | sed 's/\"//g')
|
||||
echo "Cur. article: $i.$j, ID : $ITEM_ID"
|
||||
echo " Cur. article: $i.$j, Title: $ITEM_TITLE"
|
||||
echo " Cur. article: $i.$j, URL : $ITEM_URL"
|
||||
echo " Cur. article: $i.$j, Name : $ITEM_NAME"
|
||||
|
||||
rm -rf $STAGEDIR
|
||||
mkdir $STAGEDIR
|
||||
|
||||
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
|
||||
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$STAGEDIR/$ITEM_NAME" -c -q
|
||||
echo " Downloaded."
|
||||
|
||||
echo "Compressing."
|
||||
7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
|
||||
fi
|
||||
|
||||
# Items whose URL points at the city's ArcGIS REST services are map layers.
# Export each one in four formats into the staging folder, then archive it.
if [[ "$ITEM_URL" == *"maps.london.ca/server/rest/services"* ]]; then
  # Layer id = whatever follows the last "/MapServer/" in the item URL.
  MAP_ID="${ITEM_URL##*/MapServer/}"
  echo " ^^^ Item is map. ($MAP_ID) "
  # https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
  # KML and GeoJSON use the spatial ID of 4326, all others use 26917
  MAP_CSV="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=csv&spatialRefId=26917&where=1=1"
  MAP_SHP="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=shp&spatialRefId=26917&where=1=1"
  MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
  MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"

  echo " Map URL (CSV) : $MAP_CSV"
  wget --user-agent="$WGET_UA" "$MAP_CSV" -O "$STAGEDIR/$ITEM_TITLE.csv" -c -q
  echo " Downloaded."

  echo " Map URL (Shapefile): $MAP_SHP"
  wget --user-agent="$WGET_UA" "$MAP_SHP" -O "$STAGEDIR/$ITEM_TITLE.shp" -c -q
  echo " Downloaded."

  echo " Map URL (GeoJSON) : $MAP_GEO"
  wget --user-agent="$WGET_UA" "$MAP_GEO" -O "$STAGEDIR/$ITEM_TITLE.geojson" -c -q
  echo " Downloaded."

  echo " Map URL (KML) : $MAP_KML"
  wget --user-agent="$WGET_UA" "$MAP_KML" -O "$STAGEDIR/$ITEM_TITLE.kml" -c -q
  echo " Downloaded."

  # Double quotes so the actual URL is printed; single quotes used to print
  # the literal text "$ITEM_URL".
  echo " Source URL is $ITEM_URL."

  echo "Compressing."
  # NOTE(review): the archive password is hard-coded and passed on the command
  # line, where it is visible to other local users via the process list.
  7z a -pAEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018 "$MAPDIR/$ITEM_TITLE.7z" "$STAGEDIR"
fi
|
||||
done
|
||||
|
||||
if (( ($((i * 100)) + j) >= TOTAL_ITEMS)); then
|
||||
echo "No more items!"
|
||||
SEARCH_END=1
|
||||
break
|
||||
fi
|
||||
((i++))
|
||||
done
|
||||
351
SCRAPE_PLAN.SH
Normal file
351
SCRAPE_PLAN.SH
Normal file
@ -0,0 +1,351 @@
|
||||
#!/usr/bin/env bash
|
||||
echo -e "\n-========================================================================-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- SCRAPE_PLANAPPS.SH: Downloads planning applications -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- Lillian Skinner -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
#######################################
# Pull a date out of a line of text shaped like
# "..., <Month> <day>, ... <4-digit year> ..." (e.g. "Friday, March 8, 2024").
# Arguments:
#   $1 - the line to parse
# Globals written:
#   PROJECT_TIME_YEAR       - last 4-digit run in the line, empty if none
#   PROJECT_TIME_MONTH_WORD - month name as written, empty if not found
#   PROJECT_TIME_DAY_SHORT  - day as written (1-2 digits), empty if not found
#   PROJECT_TIME_DAY        - day zero-padded to two digits, "00" if not found
#   PROJECT_TIME_MONTH      - two-digit month number, "--" if not recognised
#######################################
conv_date_plan() {
  # "sed -n ... p" prints only when the pattern matched, so a line without a
  # date yields empty values instead of leaking the whole line into the results.
  PROJECT_TIME_YEAR=$(printf '%s\n' "$1" \
    | sed -n 's/.*\([0-9]\{4\}\).*/\1/p')
  PROJECT_TIME_MONTH_WORD=$(printf '%s\n' "$1" \
    | sed -n 's/.*,[[:space:]]*\([A-Za-z]*\)[[:space:]][[:space:]]*[0-9]\{1,2\},.*/\1/p')
  PROJECT_TIME_DAY_SHORT=$(printf '%s\n' "$1" \
    | sed -n 's/.*,[[:space:]]*[A-Za-z]*[[:space:]][[:space:]]*\([0-9]\{1,2\}\),.*/\1/p')

  # 10# forces base 10: without it printf rejects "08" and "09" as invalid octal.
  if [[ "$PROJECT_TIME_DAY_SHORT" =~ ^[0-9]+$ ]]; then
    printf -v PROJECT_TIME_DAY '%02d' "$((10#$PROJECT_TIME_DAY_SHORT))"
  else
    PROJECT_TIME_DAY="00"
  fi

  case "$PROJECT_TIME_MONTH_WORD" in
    Jan*) PROJECT_TIME_MONTH="01" ;;
    Feb*) PROJECT_TIME_MONTH="02" ;;
    Mar*) PROJECT_TIME_MONTH="03" ;;
    Apr*) PROJECT_TIME_MONTH="04" ;;
    May) PROJECT_TIME_MONTH="05" ;;
    Jun*) PROJECT_TIME_MONTH="06" ;;
    Jul*) PROJECT_TIME_MONTH="07" ;;
    Aug*) PROJECT_TIME_MONTH="08" ;;
    Sep*) PROJECT_TIME_MONTH="09" ;;
    Oct*) PROJECT_TIME_MONTH="10" ;;
    Nov*) PROJECT_TIME_MONTH="11" ;;
    Dec*) PROJECT_TIME_MONTH="12" ;;
    *) PROJECT_TIME_MONTH="--" ;;
  esac
}
|
||||
|
||||
#######################################
# Split a date written as "<Month> <day>, <year>..." (e.g. "February 25, 2026")
# into its parts.
# Arguments:
#   $1 - the date text; leading whitespace is ignored
# Globals written:
#   MODIFIED_MONTH_WORD - month name as written, empty if not found
#   MODIFIED_DAY_SHORT  - day as written, empty if not found
#   MODIFIED_DAY        - day zero-padded to two digits, "00" if not found
#   MODIFIED_YEAR       - year digits, empty if not found
#   MODIFIED_MONTH      - two-digit month number, "--" if not recognised
#######################################
conv_date() {
  local text=$1
  # Patterns live in variables because they contain literal spaces.
  local re_month='^([A-Za-z]+) '
  local re_day='^[A-Za-z]+ ([0-9]+),'
  local re_year='^[A-Za-z]+ [0-9]+, ([0-9]+)'

  # Drop leading whitespace so the anchored patterns can match.
  text="${text#"${text%%[![:space:]]*}"}"

  # Start from empty values: input that does not match must not be copied
  # verbatim into the year or day.
  MODIFIED_MONTH_WORD=""
  MODIFIED_DAY_SHORT=""
  MODIFIED_YEAR=""
  if [[ "$text" =~ $re_month ]]; then
    MODIFIED_MONTH_WORD=${BASH_REMATCH[1]}
  fi
  if [[ "$text" =~ $re_day ]]; then
    MODIFIED_DAY_SHORT=${BASH_REMATCH[1]}
  fi
  if [[ "$text" =~ $re_year ]]; then
    MODIFIED_YEAR=${BASH_REMATCH[1]}
  fi

  # 10# forces base 10: without it printf rejects "08" and "09" as invalid octal.
  if [[ -n "$MODIFIED_DAY_SHORT" ]]; then
    printf -v MODIFIED_DAY '%02d' "$((10#$MODIFIED_DAY_SHORT))"
  else
    MODIFIED_DAY="00"
  fi

  case "$MODIFIED_MONTH_WORD" in
    Jan*) MODIFIED_MONTH="01" ;;
    Feb*) MODIFIED_MONTH="02" ;;
    Mar*) MODIFIED_MONTH="03" ;;
    Apr*) MODIFIED_MONTH="04" ;;
    May) MODIFIED_MONTH="05" ;;
    Jun*) MODIFIED_MONTH="06" ;;
    Jul*) MODIFIED_MONTH="07" ;;
    Aug*) MODIFIED_MONTH="08" ;;
    Sep*) MODIFIED_MONTH="09" ;;
    Oct*) MODIFIED_MONTH="10" ;;
    Nov*) MODIFIED_MONTH="11" ;;
    Dec*) MODIFIED_MONTH="12" ;;
    *) MODIFIED_MONTH="--" ;;
  esac
}
|
||||
|
||||
# Warning to all who read this script:
|
||||
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
||||
|
||||
# London seems to have recently blocked unusual user agents, so wget's default user agent (and even ping) gets rejected. Thankfully, we can pretend to be a regular browser!
|
||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||
|
||||
TEMP_DIR="./tmp/"
|
||||
SEARCH_PAGE="./tmp/index_pa.html"
|
||||
PROJECT_PAGE="./tmp/work_pa.html"
|
||||
PROJECT_INFO="./tmp/info.txt"
|
||||
PROJECT_ATTACH_NAMES="./tmp/names.txt"
|
||||
PROJECT_ATTACH_URLS="./tmp/urls.txt"
|
||||
PROJECT_IMAGE_NAMES="./tmp/image-names.txt"
|
||||
PROJECT_IMAGE_URLS="./tmp/image-urls.txt"
|
||||
|
||||
current_year=$(date +%Y)
|
||||
current_month=$(date +%m)
|
||||
current_day=$(date +%d)
|
||||
|
||||
#if [ -d "$TEMP_DIR" ]; then
|
||||
# rm -r $TEMP_DIR
|
||||
#fi
|
||||
rm -f $SEARCH_PAGE
|
||||
rm -f $PROJECT_PAGE
|
||||
|
||||
mkdir $TEMP_DIR
|
||||
|
||||
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
|
||||
|
||||
j=0
|
||||
SEARCH_END="FALSE"
|
||||
while [[ $SEARCH_END == "FALSE" ]]; do
|
||||
echo "-========================================================================-"
|
||||
echo "Downloading search results... Page $j"
|
||||
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
PAGE_HAS_APPS=$(cat $SEARCH_PAGE | grep "teaser__title")
|
||||
if [[ "$PAGE_HAS_APPS" != "" ]]; then
|
||||
while IFS= read -r LINE; do
|
||||
|
||||
rm -f $PROJECT_INFO
|
||||
PAGE_FOUND_APP=$(echo $LINE | grep "teaser__title")
|
||||
if [[ "$PAGE_FOUND_APP" != "" ]]; then
|
||||
echo "-========================================================================-"
|
||||
echo "Task starting on: $(date)"
|
||||
|
||||
PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq)
|
||||
PROJECT_URL=$(echo "https://london.ca"$PROJECT_URL)
|
||||
|
||||
echo "Downloading page..."
|
||||
wget --user-agent="$WGET_UA" $PROJECT_URL -O $PROJECT_PAGE --timestamping -q #--show-progress
|
||||
|
||||
# Removing COVID is due to the naming in the 2020s. Keeping it for revisiting wayback crawls.
|
||||
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
|
||||
echo " Found project: $PROJECT_NAME"
|
||||
|
||||
MODIFIED_MONTH=""
|
||||
MODIFIED_YEAR=""
|
||||
conv_date "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
|
||||
# Only keep applications modified last month or later; stop scanning this
# results page at the first older one.
# Year and month are folded into one month index (year * 12 + month) so the
# comparison also works across year boundaries: comparing the two fields
# separately rejected December pages in January and rejected early months of
# a future year late in the current one.
# Non-numeric values (e.g. the "--" month left by a failed date conversion)
# count as "in the past". The 10# prefix stops "08"/"09" being read as octal.
if [[ "$MODIFIED_YEAR" =~ ^[0-9]+$ && "$MODIFIED_MONTH" =~ ^[0-9]+$ ]] \
  && (( 10#$MODIFIED_YEAR * 12 + 10#$MODIFIED_MONTH >= 10#$current_year * 12 + 10#$current_month - 1 )); then
  echo "Last Modified: $MODIFIED_YEAR/$MODIFIED_MONTH/$MODIFIED_DAY"
else
  echo "Dates are in the past! Abort."
  break
fi
|
||||
echo "Finding attachments..."
|
||||
|
||||
rm -f $PROJECT_ATTACH_URLS
|
||||
rm -f $PROJECT_ATTACH_NAMES
|
||||
rm -f $PROJECT_IMAGE_URLS
|
||||
rm -f $PROJECT_IMAGE_NAMES
|
||||
|
||||
while IFS= read -r PLINE; do
|
||||
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
|
||||
NEXT_LINE_FITEM="FALSE"
|
||||
|
||||
# Is this line an actual item?
|
||||
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
|
||||
# Is this line bad data (usually scripts)?
|
||||
PROJECT_INFO_IS_BAD=$(echo $PLINE | grep "</script>")
|
||||
# Gotta add in the &s and 's.
|
||||
PROJECT_INFO_ITEM=$(echo $PLINE | sed 's/.*<div class="field__item">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | uniq)
|
||||
if [[ $PROJECT_INFO_IS_ITEMS == "" ]] && [[ $PROJECT_INFO_IS_BAD == "" ]] && [[ $PROJECT_INFO_ITEM != "" ]]; then
|
||||
# We'll check to see if a non-info item made it in. Sometimes attachments will get caught, but can be detected by "visually-hidden"
|
||||
PROJECT_INFO_LABEL_BAD=$(echo $PROJECT_INFO_LABEL | grep "visually-hidden")
|
||||
if [[ $PROJECT_INFO_LABEL_BAD == "" ]]; then
|
||||
printf "%-17s: %s\n" "$PROJECT_INFO_LABEL" "$PROJECT_INFO_ITEM" >> $PROJECT_INFO
|
||||
if [[ $PROJECT_INFO_LABEL == "File Number" ]]; then
|
||||
PROJECT_FILE_NUM_2=""
|
||||
PROJECT_FILE_NUM_TYPE_2=""
|
||||
PROJECT_FILE_NUM_IS_MULTI=""
|
||||
# Multiple file numbers may be listed. We will always use the first one as it is contained in PDF names.
|
||||
# I think it takes priority. Anyways, here are the possible formats:
|
||||
# XX-#####
|
||||
# XX-#####/XX-#####
|
||||
# XX-##### / XX-#####
|
||||
# XX-##### and XX-#####
|
||||
# XX-##### & XX-#####
|
||||
#
|
||||
# I think the city is allergic to standardization...
|
||||
PROJECT_FILE_NUM_IS_MULTI=$(echo $PROJECT_INFO_ITEM | grep -e "and" -e "/" -e "&")
|
||||
PROJECT_FILE_NUM=$(echo $PROJECT_INFO_ITEM | sed 's|/.*||' | sed 's| and .*||' | sed 's| & .*||' | sed 's/^[[:space:]]*//g' | sed 's/[[:space:]]*$//g' | uniq)
|
||||
PROJECT_FILE_NUM_TYPE=$(echo "$PROJECT_FILE_NUM" | sed 's/^\([^-]*\)-.*$/\1/')
|
||||
if [[ "$PROJECT_FILE_NUM_TYPE" == "Line of Sight" ]]; then
|
||||
PROJECT_FILE_TYPE="Line of Sight"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "O" ]]; then
|
||||
PROJECT_FILE_TYPE="Official Plan Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "Z" ]]; then
|
||||
PROJECT_FILE_TYPE="Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "OZ" ]]; then
|
||||
PROJECT_FILE_TYPE="Official Plan and Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "TZ" ]]; then
|
||||
PROJECT_FILE_TYPE="Temporary Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "39T" ]]; then
|
||||
PROJECT_FILE_TYPE="Draft Plan of Subdivision"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "39CD" ]]; then
|
||||
PROJECT_FILE_TYPE="Draft Plan of Condominium"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" =~ ^SPA2[0-9]+$ ]]; then
|
||||
PROJECT_FILE_TYPE="Site Plan Control Application"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "M" ]]; then
|
||||
PROJECT_FILE_TYPE="Minor Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE" == "H" ]]; then
|
||||
PROJECT_FILE_TYPE="Holding Provision By-law Amendment"
|
||||
else
|
||||
PROJECT_FILE_TYPE="BAD RECORD TYPE"
|
||||
fi
|
||||
echo "Found file# : $PROJECT_FILE_NUM ($PROJECT_FILE_TYPE)"
|
||||
|
||||
if [[ "$PROJECT_FILE_NUM_IS_MULTI" != "" ]]; then
|
||||
# It isn't great, but if a project has 2 file numbers then we'll save it as both.
|
||||
# I'm not sure how to get around this since I don't have a way to tag files.
|
||||
PROJECT_FILE_NUM_2=$(echo $PROJECT_INFO_ITEM | sed 's|.*/||' | sed 's|.* and ||' | sed 's|.* & ||' | sed 's/^[[:space:]]*//g' | sed 's/[[:space:]]*$//g' | uniq)
|
||||
PROJECT_FILE_NUM_TYPE_2=$(echo "$PROJECT_FILE_NUM_2" | sed 's/^\([^-]*\)-.*$/\1/')
|
||||
if [[ "$PROJECT_FILE_NUM_TYPE_2" == "Line of Sight" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Line of Sight"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "O" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Official Plan Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "Z" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "OZ" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Official Plan and Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "TZ" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Temporary Zoning By-law Amendment"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "39T" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Draft Plan of Subdivision"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" == "39CD" ]]; then
|
||||
PROJECT_FILE_TYPE_2="Draft Plan of Condominium"
|
||||
elif [[ "$PROJECT_FILE_NUM_TYPE_2" =~ ^SPA2[0-9]+$ ]]; then
|
||||
PROJECT_FILE_TYPE_2="Site Plan Control Application"
|
||||
else
|
||||
PROJECT_FILE_TYPE_2="BAD RECORD TYPE"
|
||||
fi
|
||||
echo "Also filed as: $PROJECT_FILE_NUM_2 ($PROJECT_FILE_TYPE_2)"
|
||||
fi
|
||||
fi
|
||||
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
|
||||
if [[ $PROJECT_FOUND_TIME != "" ]]; then
|
||||
conv_date_plan "$PLINE"
|
||||
echo "Found date : $PROJECT_TIME_YEAR/$PROJECT_TIME_MONTH/$PROJECT_TIME_DAY"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
|
||||
NEXT_LINE_IMAGE="FALSE"
|
||||
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
|
||||
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
||||
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
|
||||
PROJECT_IMAGE_URL=$(echo "https://london.ca"$PROJECT_IMAGE_URL)
|
||||
fi
|
||||
PROJECT_IMAGE_NAME=$(echo $PROJECT_IMAGE_URL | sed 's#.*/##p' | uniq)
|
||||
echo $PROJECT_IMAGE_URL >> $PROJECT_IMAGE_URLS
|
||||
echo $PROJECT_IMAGE_NAME >> $PROJECT_IMAGE_NAMES
|
||||
fi
|
||||
|
||||
PROJECT_FOUND_FILE=$(echo $PLINE | grep "file--mime-application-")
|
||||
if [[ $PROJECT_FOUND_FILE != "" ]]; then
|
||||
PROJECT_ATTACH_URL=$(echo $PLINE | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq)
|
||||
# Newer links are relative paths, so we must add back the domain
|
||||
PROJECT_ATTACH_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
||||
if [[ $PROJECT_ATTACH_URL_SHORT == "" ]];then
|
||||
PROJECT_ATTACH_URL=$(echo "https://london.ca"$PROJECT_ATTACH_URL)
|
||||
fi
|
||||
PROJECT_ATTACH_NAME=$(echo $PLINE | sed 's/.*title="\([^"]*\)".*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | uniq)
|
||||
echo $PROJECT_ATTACH_URL >> $PROJECT_ATTACH_URLS
|
||||
echo $PROJECT_ATTACH_NAME >> $PROJECT_ATTACH_NAMES
|
||||
fi
|
||||
|
||||
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
|
||||
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
|
||||
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
|
||||
NEXT_LINE_FITEM="TRUE"
|
||||
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
|
||||
# We're setting a flag to let the script know if an upcoming line is contents.
|
||||
fi
|
||||
|
||||
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
|
||||
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
|
||||
NEXT_LINE_IMAGE="TRUE"
|
||||
# Same idea as before but for the image shown on the main page.
|
||||
fi
|
||||
|
||||
PROJECT_FOUND_EMAIL=$(echo $PLINE | grep "field--name-field-email" | sed 's/.*href="\([^"]*\)".*/\1/p' | sed 's|^mailto:||' | uniq)
|
||||
if [[ "$PROJECT_FOUND_EMAIL" != "" ]]; then
|
||||
printf "%-17s: %s\n" "Email" "$PROJECT_FOUND_EMAIL" >> $PROJECT_INFO
|
||||
fi
|
||||
PROJECT_FOUND_PLANNER=$(echo $PLINE | grep "field--name-name" | sed 's/.*<div[^>]*>\([^<]*\)<[\/:-]div>.*/\1/p' | uniq)
|
||||
if [[ "$PROJECT_FOUND_PLANNER" != "" ]]; then
|
||||
printf "\n%-17s: %s\n" "Planner" "$PROJECT_FOUND_PLANNER" >> $PROJECT_INFO
|
||||
fi
|
||||
|
||||
done < $PROJECT_PAGE
|
||||
echo "Filing away all the datas..."
|
||||
|
||||
# Work out where this application is filed and create the folders.
#   - one file number, or two of the same type:
#       <type>/<num> [& <num 2>] - <project name>/Attachments
#   - two file numbers of different types: the project is additionally filed
#     under the second type as <type 2>/<num 2> - <project name>/Attachments
# mkdir -p creates all missing levels and stays quiet when they already exist
# (plain mkdir printed "File exists" errors for every project after the first).
# The right-hand sides of ==/!= are quoted so they compare as plain strings
# rather than as glob patterns.
PLAN_APPS_DIR="./LondonArchive/Planning Applications"
mkdir -p -- "$PLAN_APPS_DIR/$PROJECT_FILE_TYPE"

if [[ -n "$PROJECT_FILE_NUM_2" && "$PROJECT_FILE_TYPE" == "$PROJECT_FILE_TYPE_2" ]]; then
  PROJECT_SAVE_PATH="$PLAN_APPS_DIR/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM & $PROJECT_FILE_NUM_2 - $PROJECT_NAME"
else
  PROJECT_SAVE_PATH="$PLAN_APPS_DIR/$PROJECT_FILE_TYPE/$PROJECT_FILE_NUM - $PROJECT_NAME"
fi
mkdir -p -- "$PROJECT_SAVE_PATH/Attachments"

if [[ -n "$PROJECT_FILE_NUM_2" && "$PROJECT_FILE_TYPE" != "$PROJECT_FILE_TYPE_2" ]]; then
  PROJECT_SAVE_PATH_2="$PLAN_APPS_DIR/$PROJECT_FILE_TYPE_2/$PROJECT_FILE_NUM_2 - $PROJECT_NAME"
  mkdir -p -- "$PROJECT_SAVE_PATH_2/Attachments"
fi
|
||||
|
||||
echo "Saving attachments:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
#echo " - $LINEA1"
|
||||
echo " - $LINEA2"
|
||||
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH/Attachments/$LINEA2" --timestamping -q #--show-progress
|
||||
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
|
||||
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH_2/Attachments/$LINEA2" --timestamping -q #--show-progress
|
||||
fi
|
||||
done < $PROJECT_ATTACH_URLS 3< $PROJECT_ATTACH_NAMES
|
||||
echo "All attachments saved."
|
||||
|
||||
if [[ "$PROJECT_IMAGE_URL" != "" ]]; then
|
||||
PROJECT_IMAGE_URL=""
|
||||
echo "Saving photos:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
#echo " - $LINEA1"
|
||||
echo " - $LINEA2"
|
||||
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH/$LINEA2" --timestamping -q #--show-progress
|
||||
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
|
||||
wget --user-agent="$WGET_UA" "$LINEA1" -O "$PROJECT_SAVE_PATH_2/$LINEA2" --timestamping -q #--show-progress
|
||||
fi
|
||||
done < $PROJECT_IMAGE_URLS 3< $PROJECT_IMAGE_NAMES
|
||||
echo "All photos saved."
|
||||
fi
|
||||
|
||||
echo "Extracted info summary:"
|
||||
cat $PROJECT_INFO > "$PROJECT_SAVE_PATH/Info.txt"
|
||||
if [[ $PROJECT_FILE_NUM_2 != "" ]] && [[ $PROJECT_FILE_TYPE != $PROJECT_FILE_TYPE_2 ]]; then
|
||||
cat $PROJECT_INFO > "$PROJECT_SAVE_PATH_2/Info.txt"
|
||||
fi
|
||||
cat $PROJECT_INFO
|
||||
fi
|
||||
done < $SEARCH_PAGE
|
||||
else
|
||||
SEARCH_END="TRUE"
|
||||
echo "No more pages!"
|
||||
fi
|
||||
else
|
||||
SEARCH_END="TRUE"
|
||||
echo "No more pages!"
|
||||
fi
|
||||
((j++))
|
||||
done
|
||||
Loading…
Reference in New Issue
Block a user