100 lines
4.1 KiB
Bash
100 lines
4.1 KiB
Bash
#!/usr/bin/env bash

# Startup banner: one blank line followed by the framed script title and
# author, written to stdout with a single printf call (one argument per line).
printf '%s\n' \
  '' \
  '-========================================================================-' \
  '-=- -=-' \
  '-=- SCRAPE_ESCRIBE.SH: Download eScribe meetings JSONs -=-' \
  '-=- -=-' \
  '-=- Lillian Skinner -=-' \
  '-=- -=-' \
  '-========================================================================-'
|
|
|
|
# Warning to all who read this script:
|
|
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
|
|
|
# London seems to have recently blocked unusual user agents, so plain wget (and even ping) gets refused. Thankfully we can pretend to be a real person!
|
|
# Browser-like User-Agent string handed to wget (--user-agent) when the
# eScribe index page is downloaded.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch directory (keep the trailing slash) and the working files stored in
# it. Every path is derived from TEMP_DIR so relocating the scratch area only
# needs one edit; the resulting values are the same as the previous literals.
TEMP_DIR="./tmp/"
INDEX_PAGE="${TEMP_DIR}index_cal.html"
SEARCH_PAGE="${TEMP_DIR}search.html"
AGENDA_HTML="${TEMP_DIR}work.html"
ADDENDUM_HTML="${TEMP_DIR}addendum.html"
#VIDEO_TIMESTAMP_JSON="${TEMP_DIR}time_cal.json"

# Today's date as zero-padded strings (YYYY, MM, DD). All three parts come
# from ONE `date` invocation so they can never disagree when the script
# happens to start right at midnight / a month end / New Year.
read -r current_year current_month current_day < <(date '+%Y %m %d')
|
|
|
|
#######################################
# Throw away a scratch directory (and everything in it) and recreate it empty.
# Arguments: $1 - path of the scratch directory
# Outputs:   an error message on stderr when the path is empty
# Returns:   0 on success; non-zero if the path is empty or rm/mkdir fails
#######################################
reset_temp_dir() {
  local dir=$1

  # An empty path would make the rm/mkdir below operate on nothing useful (or
  # worse); refuse instead of guessing.
  if [[ -z "$dir" ]]; then
    echo "SCRAPE_ESCRIBE: Temp directory path is empty; refusing to reset it." >&2
    return 1
  fi

  # Quoted and '--'-guarded so paths with spaces or a leading dash are safe.
  rm -rf -- "$dir" || return
  mkdir -p -- "$dir"
}

# Start every run from a clean scratch area.
reset_temp_dir "$TEMP_DIR"

# With the default paths these files sit inside TEMP_DIR and are already gone;
# the explicit removal only matters if they are ever pointed somewhere else.
rm -f -- "$INDEX_PAGE" "$SEARCH_PAGE" "$AGENDA_HTML"
|
|
|
|
# Number of meetings a full PastMeetings page holds. A page with fewer
# entries than this is treated as the last page of a meeting type.
ESCRIBE_PAGE_SIZE=50

#######################################
# Remove every double quote, comma and carriage return from one CSV field.
# Arguments: $1 - raw field text
# Outputs:   the cleaned field on stdout
#######################################
clean_csv_field() {
  local field=$1
  field=${field//\"/}
  field=${field//,/}
  field=${field//$'\r'/}
  printf '%s\n' "$field"
}

#######################################
# Trim a string and squeeze each run of spaces/tabs into one space. This is
# what an unquoted `echo $var` does, minus the accidental filename globbing.
# Arguments: $1 - text to normalise
# Outputs:   the normalised text on stdout
#######################################
collapse_whitespace() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<<"$1"
  printf '%s\n' "${words[*]}"
}

#######################################
# Extract the label between <option ...> and the closing option tag.
# Text that does not match the pattern is printed unchanged.
# Arguments: $1 - one (whitespace-normalised) line of HTML
# Outputs:   the option label on stdout
#######################################
option_label() {
  sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g' <<<"$1"
}

#######################################
# POST one PastMeetings request and store the pretty-printed reply.
# Arguments: $1 - index URL (must end with a slash)
#            $2 - meeting type label
#            $3 - page number (the service counts pages from 1)
#            $4 - output file
# Returns:   exit status of the final jq stage
#######################################
fetch_past_meetings_page() {
  local index_url=$1 meeting_type=$2 page=$3 out_file=$4
  local payload

  # Let jq build the body so quotes/apostrophes in the label are escaped.
  # NOTE(review): this sends standard double-quoted JSON; the previous
  # hand-built body used single quotes. Assumes the endpoint accepts standard
  # JSON - confirm against the live service.
  payload=$(jq -cn --arg type "$meeting_type" --argjson page "$page" \
    '{type: $type, pageNumber: $page}') || return

  # NOTE(review): --insecure disables TLS certificate verification (kept from
  # the original command line).
  curl -s --insecure -X POST \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "${index_url}MeetingsCalendarView.aspx/PastMeetings" \
    | jq . > "$out_file"
}

#######################################
# Download every PastMeetings page of one meeting type into
#   <archive>/Meetings (JSON)/<type>/<type>_<n>.json   (n starts at 0)
# Globals:   ESCRIBE_PAGE_SIZE (read)
# Arguments: $1 - index URL (must end with a slash)
#            $2 - city archive directory name
#            $3 - meeting type label
# Outputs:   a warning on stderr when a page cannot be read
# Returns:   0 when the last page was reached, non-zero otherwise
#######################################
download_meeting_type() {
  local index_url=$1 archive_name=$2 meeting_type=$3
  # A '/' in the label would otherwise be taken as a path separator.
  local dir_name=${meeting_type//\//-}
  local out_dir="${archive_name}/Meetings (JSON)/${dir_name}"
  local page=1 page_file count

  mkdir -p -- "$out_dir" || return

  while true; do
    page_file="${out_dir}/${dir_name}_$(( page - 1 )).json"
    fetch_past_meetings_page "$index_url" "$meeting_type" "$page" "$page_file"

    # An empty or non-JSON reply yields no number here; stop instead of
    # spinning forever on a page that never arrives.
    count=$(jq '.d.Meetings | length' "$page_file" 2>/dev/null)
    if [[ ! "$count" =~ ^[0-9]+$ ]]; then
      echo "SCRAPE_ESCRIBE: No usable reply for '${meeting_type}' page ${page}; skipping the rest of this type." >&2
      return 1
    fi

    # Fewer meetings than a full page == we're on the final page.
    if (( count < ESCRIBE_PAGE_SIZE )); then
      break
    fi
    page=$(( page + 1 ))
  done
}

# Main loop: one site per line of websites.csv. Each line is split on
# whitespace into three fields, and quotes/commas are stripped from each
# (presumably `"index url", "archive name", "calendar name"` - confirm against
# the real file). The `|| [[ -n ... ]]` keeps a final line that lacks a
# trailing newline.
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE \
  || [[ -n "$INDEX_URL_PRE" ]]; do
  # Blank lines carry no URL; skip them.
  [[ -n "$INDEX_URL_PRE" ]] || continue

  INDEX_URL=$(clean_csv_field "$INDEX_URL_PRE")
  CITY_ARCHIVE_NAME=$(clean_csv_field "$CITY_ARCHIVE_NAME_PRE")
  # Parsed for completeness; nothing in this loop uses it yet.
  CALENDAR_NAME=$(clean_csv_field "$CALENDAR_NAME_PRE")

  echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
  # Any non-zero wget status means there is no usable index; move on to the
  # next site rather than re-downloading in an endless loop.
  if ! wget --no-check-certificate --user-agent="$WGET_UA" "$INDEX_URL" \
    -O "$INDEX_PAGE" --show-progress; then
    echo "SCRAPE_ESCRIBE: Couldn't save index!"
    continue
  fi

  # Scan the index line by line. Once the line carrying the meeting-type
  # listbox has been seen, every following line containing "<option " names a
  # meeting type; the first line without one ends the list.
  FOUNDLIST=0
  while IFS= read -r LINE || [[ -n "$LINE" ]]; do
    if (( FOUNDLIST )); then
      LINE_FLAT=$(collapse_whitespace "$LINE")
      if [[ "$LINE_FLAT" != *'<option '* ]]; then
        echo "SCRAPE_ESCRIBE: End of list."
        break
      fi

      MEETING_NAME=$(option_label "$LINE_FLAT")
      echo "-========================================================================-"
      printf '%s\n' "- ${MEETING_NAME}"
      download_meeting_type "$INDEX_URL" "$CITY_ARCHIVE_NAME" "$MEETING_NAME"
    fi

    if [[ "$LINE" == *'class="MeetingTypeListbox"'* ]]; then
      echo "SCRAPE_ESCRIBE: Found meeting type list."
      FOUNDLIST=1
    fi
  done < "$INDEX_PAGE"

  if (( ! FOUNDLIST )); then
    echo "SCRAPE_ESCRIBE: No meeting type list found in the index of ${INDEX_URL}" >&2
  fi
done < websites.csv
|