Update SCRAPE_MEET.SH
This commit is contained in:
parent
f694de0674
commit
3a76f2f5af
661
SCRAPE_MEET.SH
661
SCRAPE_MEET.SH
@ -8,25 +8,25 @@ echo -e "-=-
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
# conv_date DATE
#
# Split a "Month D, YYYY" style date (e.g. "Feb 25, 2026", "September 08, 2025"
# or the comma-less "Feb 25 2026") into its parts. DATE is echoed to stdout and
# the parts are published through these globals:
#   MEETING_MONTH_WORD  month name as written in DATE
#   MEETING_DAY_SHORT   day digits as written in DATE
#   MEETING_DAY         day zero-padded to two digits ("00" if not numeric)
#   MEETING_YEAR        year digits
#   MEETING_MONTH       two-digit month number, "--" for an unknown month name
#
# When DATE does not have the expected shape the sed substitutions do not
# match, so the *_WORD / *_SHORT / *_YEAR globals receive DATE unchanged.
conv_date() {
  # Strip leading/trailing blanks from whatever the extraction step produced.
  local trim='s/^[[:blank:]]*//; s/[[:blank:]]*$//'

  echo "$1"

  MEETING_MONTH_WORD=$(printf '%s\n' "$1" \
    | sed -E -e 's/^([A-Za-z]+) .*/\1/' -e "$trim")

  # The comma after the day is optional so "Feb 25 2026" parses as well.
  MEETING_DAY_SHORT=$(printf '%s\n' "$1" \
    | sed -E -e 's/^[A-Za-z]+ ([0-9]+),?.*/\1/' -e "$trim")

  # Force base 10: a zero-padded day such as "08" would otherwise be read as
  # an (invalid) octal number by printf.
  if [[ $MEETING_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  else
    MEETING_DAY="00"
  fi

  MEETING_YEAR=$(printf '%s\n' "$1" \
    | sed -E -e 's/^[A-Za-z]+ [0-9]+,? ([0-9]+).*/\1/' -e "$trim")

  # Prefix match so both abbreviated and full month names are accepted.
  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May)  MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *)    MEETING_MONTH="--" ;;
  esac
}
|
||||
|
||||
@ -54,130 +54,156 @@ conv_date_alt() {
|
||||
}
|
||||
|
||||
# set_agenda_url TITLE URL
#
# Store URL in the global that corresponds to the document TITLE.
# TITLE is compared including its surrounding double quotes, e.g.
# '"Agenda (PDF)"'. All double-quote characters are removed from URL before
# it is stored. An unrecognised TITLE leaves every global untouched.
set_agenda_url() {
  local target

  case "$1" in
    '"Agenda (HTML)"')                  target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"')                   target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"')          target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"')           target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"')                 target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"')                  target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"')     target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"')      target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"')       target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"')        target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"')             target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"')              target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"')                target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"')                 target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  # Strip the quotes with parameter expansion instead of `echo $2 | sed`:
  # the unquoted echo let the shell glob-expand URLs containing "?" or "*".
  printf -v "$target" '%s' "${2//\"/}"
}
|
||||
|
||||
# clear_agenda_url
#
# Reset every per-meeting document URL global to the empty string so that
# values from a previous meeting cannot leak into the next one.
clear_agenda_url() {
  local name

  for name in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    # printf -v assigns to the global of that name (no local shadows it).
    printf -v "$name" '%s' ""
  done
}
|
||||
|
||||
# download_helper URL OUT
#
# Fetch URL into the file OUT with curl, following redirects, skipping TLS
# verification (-k) and sending $WGET_UA as the user agent.
# Globals:  WGET_UA (read)
# Outputs:  "Downloaded." (HTTP 200) or "Already exists! Skipping." (HTTP 304)
#           on stdout; a "FAILED! ..." line on stderr for anything else.
# Returns:  0 for HTTP 200/304, 1 otherwise.
#
# NOTE(review): curl is run without --fail, so on a 4xx/5xx answer the
# server's error document is still written to OUT; callers that care should
# check the return status.
download_helper() {
  local url="$1"
  local out="$2"
  local code
  local -a curl_args=(-L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2)

  # Nothing sensible can be fetched without both arguments; report it in the
  # same shape curl's own failure ("000") would have produced.
  if [[ -z $url || -z $out ]]; then
    echo "FAILED! 000: $out | $url" >&2
    return 1
  fi

  # Only make the request conditional when there is real content to keep.
  # A missing or zero-byte OUT (e.g. an interrupted earlier run) must be
  # re-fetched rather than answered with 304 and reported as existing.
  if [[ -s $out ]]; then
    curl_args+=(-z "$out")
  fi

  code=$(curl "${curl_args[@]}" -o "$out" "$url")
  case "$code" in
    200)
      echo "Downloaded."
      ;;
    304)
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      return 1
      ;;
  esac
}
|
||||
|
||||
# _download_agenda_doc MESSAGE URL PATH
#
# Print MESSAGE, then hand URL and PATH to download_helper and pass its
# status back to the caller.
_download_agenda_doc() {
  echo "$1"
  download_helper "$2" "$3"
}

# download_agendas OUTDIR
#
# Save every document whose URL global is currently set into OUTDIR.
# For agendas (regular, revised, full package, post, cover) and addenda the
# PDF is preferred and the HTML version is only fetched when no PDF URL is
# set. For minutes, both PDF variants are fetched when present and the HTML
# version only when neither PDF URL is set.
# Globals:  the *_URL variables filled in by set_agenda_url (read)
# Returns:  0 if every attempted download succeeded, 1 if any failed. All
#           remaining documents are still attempted after a failure.
download_agendas() {
  local outdir="$1"
  local rc=0

  # --- Agenda: revised / regular --------------------------------------
  if [[ -n $AGENDA_REVISE_PDF_URL ]]; then
    _download_agenda_doc "Saving revised agenda as PDF..." \
      "$AGENDA_REVISE_PDF_URL" "$outdir/Agenda_Revised.pdf" || rc=1
  fi

  if [[ -n $AGENDA_PDF_URL ]]; then
    _download_agenda_doc "Saving regular agenda as PDF..." \
      "$AGENDA_PDF_URL" "$outdir/Agenda.pdf" || rc=1
  fi

  if [[ -z $AGENDA_REVISE_PDF_URL && -n $AGENDA_REVISE_HTML_URL ]]; then
    _download_agenda_doc "Saving revised agenda as HTML... (no PDF found!)" \
      "$AGENDA_REVISE_HTML_URL" "$outdir/Agenda_Revised.html" || rc=1
  fi

  if [[ -z $AGENDA_PDF_URL && -n $AGENDA_HTML_URL ]]; then
    _download_agenda_doc "Saving regular agenda as HTML... (no PDF found!)" \
      "$AGENDA_HTML_URL" "$outdir/Agenda.html" || rc=1
  fi

  # --- Agenda: full package -------------------------------------------
  # NOTE(review): the "(no HTML found!)" suffix on the PDF messages below is
  # printed whether or not an HTML URL exists; the text is kept as-is.
  if [[ -n $AGENDA_FULL_PDF_URL ]]; then
    _download_agenda_doc "Saving full package agenda as PDF... (no HTML found!)" \
      "$AGENDA_FULL_PDF_URL" "$outdir/Agenda_FullPackage.pdf" || rc=1
  fi

  if [[ -z $AGENDA_FULL_PDF_URL && -n $AGENDA_FULL_HTML_URL ]]; then
    _download_agenda_doc "Saving full package agenda as HTML... (no PDF found!)" \
      "$AGENDA_FULL_HTML_URL" "$outdir/Agenda_FullPackage.html" || rc=1
  fi

  # --- Agenda: post ---------------------------------------------------
  if [[ -n $AGENDA_POST_PDF_URL ]]; then
    _download_agenda_doc "Saving post agenda as PDF..." \
      "$AGENDA_POST_PDF_URL" "$outdir/Agenda_Post.pdf" || rc=1
  fi

  if [[ -z $AGENDA_POST_PDF_URL && -n $AGENDA_POST_HTML_URL ]]; then
    _download_agenda_doc "Saving post agenda as HTML... (no PDF found!)" \
      "$AGENDA_POST_HTML_URL" "$outdir/Agenda_Post.html" || rc=1
  fi

  # --- Minutes --------------------------------------------------------
  if [[ -n $MINUTES_ATTACH_PDF_URL ]]; then
    _download_agenda_doc "Saving minutes with attachments as PDF..." \
      "$MINUTES_ATTACH_PDF_URL" "$outdir/Minutes_With_Attachments.pdf" || rc=1
  fi

  if [[ -n $MINUTES_PDF_URL ]]; then
    _download_agenda_doc "Saving minutes as PDF..." \
      "$MINUTES_PDF_URL" "$outdir/Minutes.pdf" || rc=1
  fi

  if [[ -z $MINUTES_ATTACH_PDF_URL && -z $MINUTES_PDF_URL && -n $MINUTES_HTML_URL ]]; then
    _download_agenda_doc "Saving minutes as HTML... (no PDF found!)" \
      "$MINUTES_HTML_URL" "$outdir/Minutes.html" || rc=1
  fi

  # --- Agenda: cover page ---------------------------------------------
  if [[ -n $AGENDA_COVER_PDF_URL ]]; then
    _download_agenda_doc "Saving cover agenda as PDF... (no HTML found!)" \
      "$AGENDA_COVER_PDF_URL" "$outdir/Agenda_Cover.pdf" || rc=1
  fi

  if [[ -z $AGENDA_COVER_PDF_URL && -n $AGENDA_COVER_HTML_URL ]]; then
    _download_agenda_doc "Saving cover agenda as HTML... (no PDF found!)" \
      "$AGENDA_COVER_HTML_URL" "$outdir/Agenda_Cover.html" || rc=1
  fi

  # --- Addendum -------------------------------------------------------
  if [[ -n $ADDENDUM_PDF_URL ]]; then
    _download_agenda_doc "Saving addendum as PDF... (no HTML found!)" \
      "$ADDENDUM_PDF_URL" "$outdir/Addendum.pdf" || rc=1
  fi

  if [[ -z $ADDENDUM_PDF_URL && -n $ADDENDUM_HTML_URL ]]; then
    _download_agenda_doc "Saving addendum as HTML... (no PDF found!)" \
      "$ADDENDUM_HTML_URL" "$outdir/Addendum.html" || rc=1
  fi

  return "$rc"
}
|
||||
|
||||
@ -196,9 +222,9 @@ ADDENDUM_HTML="./tmp/addendum.html"
|
||||
|
||||
current_year=$(date +%Y)
|
||||
current_month=$(date +%m)
|
||||
current_day=$(date +%d)00
|
||||
current_day=$(date +%d)
|
||||
|
||||
SUPPORT_PAST=1
|
||||
SUPPORT_PAST=""
|
||||
|
||||
if [ -d "$TEMP_DIR" ]; then
|
||||
rm -r $TEMP_DIR
|
||||
@ -209,215 +235,226 @@ rm -f $AGENDA_HTML
|
||||
|
||||
mkdir $TEMP_DIR
|
||||
|
||||
while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
|
||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||
while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
|
||||
INDEX_END=0
|
||||
while (( ! INDEX_END )); do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST=0
|
||||
while IFS= read -r LINE; do
|
||||
if (( FOUNDLIST )); then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END=1
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
echo "-========================================================================-"
|
||||
echo "- $MEETING_NAME"
|
||||
INDEX_END="FALSE"
|
||||
while [[ $INDEX_END == "FALSE" ]]; do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --no-hsts --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST="FALSE"
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END="TRUE"
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
echo "-========================================================================-"
|
||||
echo "- $MEETING_NAME"
|
||||
|
||||
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
||||
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
||||
echo "- Corrected to: $MEETING_NAME"
|
||||
fi
|
||||
# Pages start at 1. Ew.
|
||||
x=1
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
#cat "${TEMP_DIR}escribe.json" > debug.json
|
||||
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
||||
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
||||
echo "- Corrected to: $MEETING_NAME"
|
||||
fi
|
||||
# Pages start at 1. Ew.
|
||||
x=1
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
#cat "${TEMP_DIR}escribe.json" > debug.json
|
||||
|
||||
y=0
|
||||
i=0
|
||||
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
||||
while (true); do
|
||||
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
|
||||
y=0
|
||||
i=0
|
||||
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
||||
while (true); do
|
||||
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
|
||||
|
||||
if [[ "$NUM_IN_JSON" == "" ]]; then
|
||||
break
|
||||
fi
|
||||
if [[ "$NUM_IN_JSON" == "" ]]; then
|
||||
break
|
||||
fi
|
||||
|
||||
# Decrease in the meeting count == we're on the final page.
|
||||
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then
|
||||
((x++))
|
||||
i=0
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
elif (( $i >= 10#$NUM_IN_JSON )); then
|
||||
break
|
||||
fi
|
||||
# Decrease in the meeting count == we're on the final page.
|
||||
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then
|
||||
((x++))
|
||||
i=0
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
elif (( $i >= 10#$NUM_IN_JSON )); then
|
||||
break
|
||||
fi
|
||||
|
||||
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
||||
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
||||
|
||||
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
||||
# No need to cat the entire file every time.
|
||||
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
|
||||
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
||||
# No need to cat the entire file every time.
|
||||
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
|
||||
|
||||
#echo "> Meeting ID"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
||||
#echo "> Meeting Attachments"
|
||||
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
||||
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
||||
#echo "> Meeting ID"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
||||
#echo "> Meeting Attachments"
|
||||
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
||||
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
||||
|
||||
clear_agenda_url
|
||||
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
|
||||
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
||||
done
|
||||
clear_agenda_url
|
||||
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
|
||||
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
||||
done
|
||||
|
||||
# "25 Feb 2026"
|
||||
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Alternate date format."
|
||||
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
# "Feb 25 2026"
|
||||
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Standard date format."
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
else
|
||||
echo "COULD NOT FIGURE OUT DATE FORMAT!"
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
fi
|
||||
# "25 Feb 2026"
|
||||
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Alternate date format."
|
||||
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
# "Feb 25 2026"
|
||||
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Standard date format."
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
else
|
||||
echo "COULD NOT FIGURE OUT DATE FORMAT!"
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
fi
|
||||
|
||||
INPAST=""
|
||||
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
|
||||
echo "NAME : $MEETING_NAME"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
echo "A (H) : $AGENDA_HTML_URL"
|
||||
echo "A (P) : $AGENDA_PDF_URL"
|
||||
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
|
||||
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
|
||||
echo "AF(H) : $AGENDA_FULL_HTML_URL"
|
||||
echo "AF(P) : $AGENDA_FULL_PDF_URL"
|
||||
echo "AC(H) : $AGENDA_COVER_HTML_URL"
|
||||
echo "AC(P) : $AGENDA_COVER_PDF_URL"
|
||||
echo "AP(H) : $AGENDA_POST_HTML_URL"
|
||||
echo "AP(P) : $AGENDA_POST_PDF_URL"
|
||||
echo "M (H) : $MINUTES_HTML_URL"
|
||||
echo "M (P) : $MINUTES_PDF_URL"
|
||||
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
|
||||
echo "AD(H) : $ADDENDUM_HTML_URL"
|
||||
echo "AD(P) : $ADDENDUM_PDF_URL"
|
||||
else
|
||||
echo "Dates are in the past!"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
INPAST=1
|
||||
fi
|
||||
INPAST=""
|
||||
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
|
||||
echo "NAME : $MEETING_NAME"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
echo "A (H) : $AGENDA_HTML_URL"
|
||||
echo "A (P) : $AGENDA_PDF_URL"
|
||||
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
|
||||
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
|
||||
echo "AF(H) : $AGENDA_FULL_HTML_URL"
|
||||
echo "AF(P) : $AGENDA_FULL_PDF_URL"
|
||||
echo "AC(H) : $AGENDA_COVER_HTML_URL"
|
||||
echo "AC(P) : $AGENDA_COVER_PDF_URL"
|
||||
echo "AP(H) : $AGENDA_POST_HTML_URL"
|
||||
echo "AP(P) : $AGENDA_POST_PDF_URL"
|
||||
echo "M (H) : $MINUTES_HTML_URL"
|
||||
echo "M (P) : $MINUTES_PDF_URL"
|
||||
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
|
||||
echo "AD(H) : $ADDENDUM_HTML_URL"
|
||||
echo "AD(P) : $ADDENDUM_PDF_URL"
|
||||
else
|
||||
echo "Dates are in the past!"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
INPAST="TRUE"
|
||||
fi
|
||||
|
||||
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
||||
if (( INPAST )) && (( ! SUPPORT_PAST )); then
|
||||
echo "Abort."
|
||||
break
|
||||
fi
|
||||
# NOTE: this "break" only leaves the innermost "while (true)" paging loop; the enclosing loops keep running and continue with the next meeting type.
|
||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
||||
echo "Abort."
|
||||
break
|
||||
fi
|
||||
|
||||
#echo "> Meeting Video"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||
#echo "> Meeting Video"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||
|
||||
ERROR=0
|
||||
ADDENDUM_ERROR=0
|
||||
echo "Downloading agenda HTML..."
|
||||
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_FULL_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_POST_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||
else
|
||||
ERROR=1
|
||||
fi
|
||||
ERROR="FALSE"
|
||||
ADDENDUM_ERROR="FALSE"
|
||||
echo "Downloading agenda HTML..."
|
||||
|
||||
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
|
||||
else
|
||||
ADDENDUM_ERROR=1
|
||||
fi
|
||||
if [[ -n $AGENDA_REVISE_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
if (( ! ERROR )); then
|
||||
elif [[ -n $AGENDA_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
mkdir "./$CITY_ARCHIVE_NAME"
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||
elif [[ -n $AGENDA_FULL_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_FULL_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
|
||||
fi
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/"
|
||||
fi
|
||||
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
|
||||
if [ ! -d "$MEETING_DIR" ]; then
|
||||
mkdir "$MEETING_DIR/"
|
||||
fi
|
||||
if [ ! -d "$MEETING_DIR/Attachments" ]; then
|
||||
mkdir "$MEETING_DIR/Attachments/"
|
||||
fi
|
||||
elif [[ -n $AGENDA_POST_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_POST_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
if [[ $VIDEO_URL != "" ]]; then
|
||||
echo "Saving recording URL..."
|
||||
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
|
||||
fi
|
||||
elif [[ -n $AGENDA_COVER_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_COVER_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
# Get attachment links
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
if (( ! ADDENDUM_ERROR )); then
|
||||
# Get attachment links
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
fi
|
||||
# Download attachment and use the name grabbed above
|
||||
echo "Found the following agenda attachments:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
echo "- $LINEA2"
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$LINEA1" -O "$MEETING_DIR/Attachments/$LINEA2" -N -q #--show-progress
|
||||
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
|
||||
echo "All attachments saved."
|
||||
else
|
||||
ERROR="TRUE"
|
||||
fi
|
||||
|
||||
download_agendas "$MEETING_DIR"
|
||||
|
||||
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
||||
echo "dir not empty" >> /dev/null
|
||||
else
|
||||
rm -r "$MEETING_DIR/Attachments"
|
||||
fi
|
||||
if [[ -n $ADDENDUM_HTML_URL ]]; then
|
||||
download_helper "$ADDENDUM_HTML_URL" "$ADDENDUM_HTML"
|
||||
else
|
||||
ADDENDUM_ERROR="TRUE"
|
||||
fi
|
||||
|
||||
echo "All files from this meeting have been saved."
|
||||
fi
|
||||
if [[ "$ERROR" == "FALSE" ]]; then
|
||||
|
||||
((i++))
|
||||
((y++))
|
||||
done
|
||||
fi
|
||||
fi
|
||||
mkdir "./$CITY_ARCHIVE_NAME"
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST=1
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END=1
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
|
||||
fi
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/"
|
||||
fi
|
||||
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
|
||||
if [ ! -d "$MEETING_DIR" ]; then
|
||||
mkdir "$MEETING_DIR/"
|
||||
fi
|
||||
if [ ! -d "$MEETING_DIR/Attachments" ]; then
|
||||
mkdir "$MEETING_DIR/Attachments/"
|
||||
fi
|
||||
|
||||
if [[ $VIDEO_URL != "" ]]; then
|
||||
echo "Saving recording URL..."
|
||||
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
|
||||
fi
|
||||
|
||||
# Get attachment links
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
||||
# Get attachment links
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
fi
|
||||
# Download attachment and use the name grabbed above
|
||||
echo "Found the following agenda attachments:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
echo "- $LINEA2 / $LINEA1"
|
||||
download_helper "$INDEX_URL$LINEA1" "$MEETING_DIR/Attachments/$LINEA2"
|
||||
# [ ! -s "$MEETING_DIR/Attachments/$LINEA2" ] && rm -f "$MEETING_DIR/Attachments/$LINEA2"
|
||||
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
|
||||
echo "All attachments saved."
|
||||
|
||||
download_agendas "$MEETING_DIR"
|
||||
|
||||
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
||||
echo "dir not empty" >> /dev/null
|
||||
else
|
||||
rm -r "$MEETING_DIR/Attachments"
|
||||
fi
|
||||
|
||||
echo "All files from this meeting have been saved."
|
||||
find "$MEETING_DIR" -type f -size 0 -delete
|
||||
echo "Cleaning PDFs for archive.org..."
|
||||
find "$MEETING_DIR" -type f -name '*.pdf' -print0 | xargs -0 -n1 qpdf --replace-input
|
||||
# qpdf repairs and leaves garbage original PDFs
|
||||
find "$MEETING_DIR" -type f -name '*~qpdf-orig' -delete -print
|
||||
fi
|
||||
|
||||
((i++))
|
||||
((y++))
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST="TRUE"
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END="TRUE"
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
done < websites.csv
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user