diff --git a/SCRAPE_MEET.SH b/SCRAPE_MEET.SH
index 6dd8f04..45317de 100644
--- a/SCRAPE_MEET.SH
+++ b/SCRAPE_MEET.SH
@@ -8,25 +8,25 @@ echo -e "-=-
echo -e "-========================================================================-"
+# Split a date written like "January 8, 2024" into zero-padded parts.
+# Arguments: $1 - date string; month word, day and year are extracted with
+#            the "<Month> <day>, <year>" sed patterns below.
+# Globals written: MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
+#                  MEETING_YEAR, MEETING_MONTH ("--" for an unknown month).
+# Outputs: echoes $1 unchanged to stdout.
conv_date() {
- echo "$1"
+ echo "$1"
MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
- MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
+    if [[ $MEETING_DAY_SHORT =~ ^[0-9]+$ ]]; then
+        # Force base 10: printf reads a leading zero as octal, so a day of
+        # "08" or "09" was rejected as invalid octal and came out as "00".
+        MEETING_DAY=$(printf "%02d" "$((10#$MEETING_DAY_SHORT))")
+    else
+        # Day could not be extracted; keep the previous best-effort formatting.
+        MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT)
+    fi
MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
case "$MEETING_MONTH_WORD" in
- Jan*) MEETING_MONTH="01" ;;
- Feb*) MEETING_MONTH="02" ;;
- Mar*) MEETING_MONTH="03" ;;
- Apr*) MEETING_MONTH="04" ;;
- May) MEETING_MONTH="05" ;;
- Jun*) MEETING_MONTH="06" ;;
- Jul*) MEETING_MONTH="07" ;;
- Aug*) MEETING_MONTH="08" ;;
- Sep*) MEETING_MONTH="09" ;;
- Oct*) MEETING_MONTH="10" ;;
- Nov*) MEETING_MONTH="11" ;;
- Dec*) MEETING_MONTH="12" ;;
- *) MEETING_MONTH="--" ;;
+ Jan*) MEETING_MONTH="01" ;;
+ Feb*) MEETING_MONTH="02" ;;
+ Mar*) MEETING_MONTH="03" ;;
+ Apr*) MEETING_MONTH="04" ;;
+ May) MEETING_MONTH="05" ;;
+ Jun*) MEETING_MONTH="06" ;;
+ Jul*) MEETING_MONTH="07" ;;
+ Aug*) MEETING_MONTH="08" ;;
+ Sep*) MEETING_MONTH="09" ;;
+ Oct*) MEETING_MONTH="10" ;;
+ Nov*) MEETING_MONTH="11" ;;
+ Dec*) MEETING_MONTH="12" ;;
+ *) MEETING_MONTH="--" ;;
esac
}
@@ -54,130 +54,156 @@ conv_date_alt() {
}
set_agenda_url() {
- case "$1" in
- '"Agenda (HTML)"')
- AGENDA_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Agenda (PDF)"')
- AGENDA_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Revised Agenda (HTML)"')
- AGENDA_REVISE_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Revised Agenda (PDF)"')
- AGENDA_REVISE_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Minutes (HTML)"')
- MINUTES_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Minutes (PDF)"')
- MINUTES_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Minutes with Attachments (PDF)"')
- MINUTES_ATTACH_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ case "$1" in
+ '"Agenda (HTML)"')
+ AGENDA_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Agenda (PDF)"')
+ AGENDA_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Revised Agenda (HTML)"')
+ AGENDA_REVISE_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Revised Agenda (PDF)"')
+ AGENDA_REVISE_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Minutes (HTML)"')
+ MINUTES_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Minutes (PDF)"')
+ MINUTES_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Minutes with Attachments (PDF)"')
+ MINUTES_ATTACH_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Agenda Full Package (HTML)"')
- AGENDA_FULL_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Agenda Full Package (PDF)"')
- AGENDA_FULL_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Agenda Cover Page (HTML)"')
- AGENDA_COVER_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Agenda Cover Page (PDF)"')
- AGENDA_COVER_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Post Agenda (HTML)"')
- AGENDA_POST_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Post Agenda (PDF)"')
- AGENDA_POST_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Addendum (HTML)"')
- ADDENDUM_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
- '"Addendum (PDF)"')
- ADDENDUM_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
- esac
+ '"Agenda Full Package (HTML)"')
+ AGENDA_FULL_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Agenda Full Package (PDF)"')
+ AGENDA_FULL_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Agenda Cover Page (HTML)"')
+ AGENDA_COVER_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Agenda Cover Page (PDF)"')
+ AGENDA_COVER_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Post Agenda (HTML)"')
+ AGENDA_POST_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Post Agenda (PDF)"')
+ AGENDA_POST_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Addendum (HTML)"')
+ ADDENDUM_HTML_URL=$(echo $2 | sed 's/\"//g') ;;
+ '"Addendum (PDF)"')
+ ADDENDUM_PDF_URL=$(echo $2 | sed 's/\"//g') ;;
+ esac
}
clear_agenda_url() {
- AGENDA_HTML_URL=""
- AGENDA_PDF_URL=""
- AGENDA_REVISE_HTML_URL=""
- AGENDA_REVISE_PDF_URL=""
- MINUTES_HTML_URL=""
- MINUTES_PDF_URL=""
- MINUTES_ATTACH_PDF_URL=""
+ AGENDA_HTML_URL=""
+ AGENDA_PDF_URL=""
+ AGENDA_REVISE_HTML_URL=""
+ AGENDA_REVISE_PDF_URL=""
+ MINUTES_HTML_URL=""
+ MINUTES_PDF_URL=""
+ MINUTES_ATTACH_PDF_URL=""
- AGENDA_FULL_HTML_URL=""
- AGENDA_FULL_PDF_URL=""
- AGENDA_COVER_HTML_URL=""
- AGENDA_COVER_PDF_URL=""
- AGENDA_POST_HTML_URL=""
- AGENDA_POST_PDF_URL=""
- ADDENDUM_HTML_URL=""
- ADDENDUM_PDF_URL=""
+ AGENDA_FULL_HTML_URL=""
+ AGENDA_FULL_PDF_URL=""
+ AGENDA_COVER_HTML_URL=""
+ AGENDA_COVER_PDF_URL=""
+ AGENDA_POST_HTML_URL=""
+ AGENDA_POST_PDF_URL=""
+ ADDENDUM_HTML_URL=""
+ ADDENDUM_PDF_URL=""
+}
+
+# Fetch one document with curl, leaving any existing copy intact on failure.
+# Globals:   WGET_UA (read) - sent as the User-Agent header.
+# Arguments: $1 - URL to fetch
+#            $2 - destination file path
+# Outputs:   status message on stdout; failure details on stderr.
+# Returns:   0 when the file was downloaded or the server answered 304,
+#            1 otherwise.
+download_helper() {
+    local url="$1"
+    local out="$2"
+    local tmp="${out}.part"
+    local code
+    local curl_rc=0
+
+    # -z "$out" makes the request conditional on the mtime of a previous copy.
+    # The body is written to a scratch file so that an HTTP error page or an
+    # interrupted transfer never replaces (or later passes for) a good copy.
+    # -k disables certificate verification, as --no-check-certificate did in
+    # the wget calls this helper replaces.
+    code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 -z "$out" -o "$tmp" "$url") || curl_rc=$?
+
+    # Success requires both a clean curl exit and the expected HTTP status.
+    case "$curl_rc:$code" in
+        0:200)
+            if ! mv -f -- "$tmp" "$out"; then
+                rm -f -- "$tmp"
+                echo "FAILED! $code: $out | $url" >&2
+                return 1
+            fi
+            echo "Downloaded."
+            ;;
+        0:304)
+            rm -f -- "$tmp"
+            echo "Already exists! Skipping."
+            ;;
+        *)
+            rm -f -- "$tmp"
+            echo "FAILED! $code: $out | $url" >&2
+            return 1
+            ;;
+    esac
}
download_agendas() {
- if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then
- if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then
- echo "Saving revised agenda as PDF..."
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$1/Agenda_Revised.pdf" -N -q #--show-progress
- fi
- if [[ $AGENDA_PDF_URL != "" ]]; then
- echo "Saving regular agenda as PDF..."
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$1/Agenda.pdf" -N -q #--show-progress
- fi
- elif [[ $AGENDA_REVISE_HTML_URL != "" ]] || [[ $AGENDA_HTML_URL != "" ]]; then
- if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
- echo "Saving revised agenda as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$1/Agenda_Revised.html" -N -q #--show-progress
- fi
- if [[ $AGENDA_HTML_URL != "" ]]; then
- echo "Saving regular agenda as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$1/Agenda.html" -N -q #--show-progress
- fi
- elif [[ $AGENDA_FULL_PDF_URL != "" ]] || [[ $AGENDA_FULL_HTML_URL != "" ]]; then
- if [[ $AGENDA_FULL_PDF_URL != "" ]]; then
- echo "Saving full package agenda as PDF... (no HTML found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_PDF_URL" -O "$1/Agenda_FullPackage.pdf" -N -q #--show-progress
- fi
- if [[ $AGENDA_FULL_HTML_URL != "" ]]; then
- echo "Saving full package agenda as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O "$1/Agenda_FullPackage.html" -N -q #--show-progress
- fi
- elif [[ $AGENDA_POST_PDF_URL != "" ]] || [[ $AGENDA_POST_HTML_URL != "" ]]; then
- if [[ $AGENDA_POST_PDF_URL != "" ]]; then
- echo "Saving post agenda as HTML... (no HTML found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_PDF_URL" -O "$1/Agenda_Post.pdf" -N -q #--show-progress
- fi
- if [[ $AGENDA_POST_HTML_URL != "" ]]; then
- echo "Saving post agenda as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O "$1/Agenda_Post.html" -N -q #--show-progress
- fi
+ local outdir="$1"
+
+ if [[ -n $AGENDA_REVISE_PDF_URL ]]; then
+ echo "Saving revised agenda as PDF..."
+ download_helper "$AGENDA_REVISE_PDF_URL" "$outdir/Agenda_Revised.pdf"
fi
- if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then
- if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then
- echo "Saving minutes with attachments as PDF..."
- wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$1/Minutes_With_Attachments.pdf" -N -q #--show-progress
- fi
- if [[ $MINUTES_PDF_URL != "" ]]; then
- echo "Saving minutes as PDF..."
- wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$1/Minutes.pdf" -N -q #--show-progress
- fi
- else
- if [[ $MINUTES_HTML_URL != "" ]]; then
- echo "Saving minutes as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$1/Minutes.html" -N -q #--show-progress
- fi
- fi
+ if [[ -n $AGENDA_PDF_URL ]]; then
+ echo "Saving regular agenda as PDF..."
+ download_helper "$AGENDA_PDF_URL" "$outdir/Agenda.pdf"
+ fi
- if [[ $AGENDA_COVER_PDF_URL != "" ]]; then
- echo "Saving cover agenda as PDF... (no HTML found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_PDF_URL" -O "$1/Agenda_Cover.pdf" -N -q #--show-progress
+ if [[ -z $AGENDA_REVISE_PDF_URL && -n $AGENDA_REVISE_HTML_URL ]]; then
+ echo "Saving revised agenda as HTML... (no PDF found!)"
+ download_helper "$AGENDA_REVISE_HTML_URL" "$outdir/Agenda_Revised.html"
fi
- if [[ $AGENDA_COVER_HTML_URL != "" ]]; then
- echo "Saving cover agenda as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O "$1/Agenda_Cover.html" -N -q #--show-progress
+
+ if [[ -z $AGENDA_PDF_URL && -n $AGENDA_HTML_URL ]]; then
+ echo "Saving regular agenda as HTML... (no PDF found!)"
+ download_helper "$AGENDA_HTML_URL" "$outdir/Agenda.html"
fi
- if [[ $ADDENDUM_PDF_URL != "" ]]; then
- echo "Saving addendum as PDF... (no HTML found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_PDF_URL" -O "$1/Addendum.pdf" -N -q #--show-progress
+
+ if [[ -n $AGENDA_FULL_PDF_URL ]]; then
+ echo "Saving full package agenda as PDF... (no HTML found!)"
+ download_helper "$AGENDA_FULL_PDF_URL" "$outdir/Agenda_FullPackage.pdf"
fi
- if [[ $ADDENDUM_HTML_URL != "" ]]; then
- echo "Saving addendum as HTML... (no PDF found!)"
- wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O "$1/Addendum.html" -N -q #--show-progress
+
+ if [[ -z $AGENDA_FULL_PDF_URL && -n $AGENDA_FULL_HTML_URL ]]; then
+ echo "Saving full package agenda as HTML... (no PDF found!)"
+ download_helper "$AGENDA_FULL_HTML_URL" "$outdir/Agenda_FullPackage.html"
+ fi
+
+ if [[ -n $AGENDA_POST_PDF_URL ]]; then
+ echo "Saving post agenda as PDF..."
+ download_helper "$AGENDA_POST_PDF_URL" "$outdir/Agenda_Post.pdf"
+ fi
+
+ if [[ -z $AGENDA_POST_PDF_URL && -n $AGENDA_POST_HTML_URL ]]; then
+ echo "Saving post agenda as HTML... (no PDF found!)"
+ download_helper "$AGENDA_POST_HTML_URL" "$outdir/Agenda_Post.html"
+ fi
+
+ if [[ -n $MINUTES_ATTACH_PDF_URL ]]; then
+ echo "Saving minutes with attachments as PDF..."
+ download_helper "$MINUTES_ATTACH_PDF_URL" "$outdir/Minutes_With_Attachments.pdf"
+ fi
+
+ if [[ -n $MINUTES_PDF_URL ]]; then
+ echo "Saving minutes as PDF..."
+ download_helper "$MINUTES_PDF_URL" "$outdir/Minutes.pdf"
+ fi
+
+ if [[ -z $MINUTES_ATTACH_PDF_URL && -z $MINUTES_PDF_URL && -n $MINUTES_HTML_URL ]]; then
+ echo "Saving minutes as HTML... (no PDF found!)"
+ download_helper "$MINUTES_HTML_URL" "$outdir/Minutes.html"
+ fi
+
+ if [[ -n $AGENDA_COVER_PDF_URL ]]; then
+ echo "Saving cover agenda as PDF... (no HTML found!)"
+ download_helper "$AGENDA_COVER_PDF_URL" "$outdir/Agenda_Cover.pdf"
+ fi
+
+ if [[ -z $AGENDA_COVER_PDF_URL && -n $AGENDA_COVER_HTML_URL ]]; then
+ echo "Saving cover agenda as HTML... (no PDF found!)"
+ download_helper "$AGENDA_COVER_HTML_URL" "$outdir/Agenda_Cover.html"
+ fi
+
+ if [[ -n $ADDENDUM_PDF_URL ]]; then
+ echo "Saving addendum as PDF... (no HTML found!)"
+ download_helper "$ADDENDUM_PDF_URL" "$outdir/Addendum.pdf"
+ fi
+
+ if [[ -z $ADDENDUM_PDF_URL && -n $ADDENDUM_HTML_URL ]]; then
+ echo "Saving addendum as HTML... (no PDF found!)"
+ download_helper "$ADDENDUM_HTML_URL" "$outdir/Addendum.html"
fi
}
@@ -196,9 +222,9 @@ ADDENDUM_HTML="./tmp/addendum.html"
current_year=$(date +%Y)
current_month=$(date +%m)
-current_day=$(date +%d)00
+current_day=$(date +%d)
-SUPPORT_PAST=1
+SUPPORT_PAST=""
if [ -d "$TEMP_DIR" ]; then
rm -r $TEMP_DIR
@@ -209,215 +235,226 @@ rm -f $AGENDA_HTML
mkdir $TEMP_DIR
-while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
- INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g')
- CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
- CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
+while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
+ INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
+ CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
+ CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
- INDEX_END=0
- while (( ! INDEX_END )); do
- echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
- wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
- if [ $? -ne 8 ]; then
- FOUNDLIST=0
- while IFS= read -r LINE; do
- if (( FOUNDLIST )); then
- GREPENDLIST=$(echo $LINE | grep '