diff --git a/SCRAPE_MEET.SH b/SCRAPE_MEET.SH index 6dd8f04..45317de 100644 --- a/SCRAPE_MEET.SH +++ b/SCRAPE_MEET.SH @@ -8,25 +8,25 @@ echo -e "-=- echo -e "-========================================================================-" conv_date() { - echo "$1" + echo "$1" MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//') MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//') MEETING_DAY=$(printf "%02d" $MEETING_DAY_SHORT) MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//') case "$MEETING_MONTH_WORD" in - Jan*) MEETING_MONTH="01" ;; - Feb*) MEETING_MONTH="02" ;; - Mar*) MEETING_MONTH="03" ;; - Apr*) MEETING_MONTH="04" ;; - May) MEETING_MONTH="05" ;; - Jun*) MEETING_MONTH="06" ;; - Jul*) MEETING_MONTH="07" ;; - Aug*) MEETING_MONTH="08" ;; - Sep*) MEETING_MONTH="09" ;; - Oct*) MEETING_MONTH="10" ;; - Nov*) MEETING_MONTH="11" ;; - Dec*) MEETING_MONTH="12" ;; - *) MEETING_MONTH="--" ;; + Jan*) MEETING_MONTH="01" ;; + Feb*) MEETING_MONTH="02" ;; + Mar*) MEETING_MONTH="03" ;; + Apr*) MEETING_MONTH="04" ;; + May) MEETING_MONTH="05" ;; + Jun*) MEETING_MONTH="06" ;; + Jul*) MEETING_MONTH="07" ;; + Aug*) MEETING_MONTH="08" ;; + Sep*) MEETING_MONTH="09" ;; + Oct*) MEETING_MONTH="10" ;; + Nov*) MEETING_MONTH="11" ;; + Dec*) MEETING_MONTH="12" ;; + *) MEETING_MONTH="--" ;; esac } @@ -54,130 +54,156 @@ conv_date_alt() { } set_agenda_url() { - case "$1" in - '"Agenda (HTML)"') - AGENDA_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Agenda (PDF)"') - AGENDA_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Revised Agenda (HTML)"') - AGENDA_REVISE_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Revised Agenda (PDF)"') - AGENDA_REVISE_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Minutes (HTML)"') - MINUTES_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Minutes (PDF)"') - MINUTES_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Minutes with Attachments (PDF)"') - MINUTES_ATTACH_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + case "$1" in + '"Agenda (HTML)"') + AGENDA_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Agenda (PDF)"') + AGENDA_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + '"Revised Agenda (HTML)"') + AGENDA_REVISE_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Revised Agenda (PDF)"') + AGENDA_REVISE_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + '"Minutes (HTML)"') + MINUTES_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Minutes (PDF)"') + MINUTES_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + '"Minutes with Attachments (PDF)"') + MINUTES_ATTACH_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Agenda Full Package (HTML)"') - AGENDA_FULL_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Agenda Full Package (PDF)"') - AGENDA_FULL_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Agenda Cover Page (HTML)"') - AGENDA_COVER_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Agenda Cover Page (PDF)"') - AGENDA_COVER_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Post Agenda (HTML)"') - AGENDA_POST_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Post Agenda (PDF)"') - AGENDA_POST_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - '"Addendum (HTML)"') - ADDENDUM_HTML_URL=$(echo $2 | sed 's/\"//g') ;; - '"Addendum (PDF)"') - ADDENDUM_PDF_URL=$(echo $2 | sed 's/\"//g') ;; - esac + '"Agenda Full Package (HTML)"') + AGENDA_FULL_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Agenda Full Package (PDF)"') + AGENDA_FULL_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + '"Agenda Cover Page (HTML)"') + AGENDA_COVER_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Agenda Cover Page (PDF)"') + AGENDA_COVER_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + '"Post Agenda (HTML)"') + AGENDA_POST_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Post Agenda (PDF)"') + AGENDA_POST_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + '"Addendum (HTML)"') + ADDENDUM_HTML_URL=$(echo $2 | sed 's/\"//g') ;; + '"Addendum (PDF)"') + ADDENDUM_PDF_URL=$(echo $2 | sed 's/\"//g') ;; + esac } clear_agenda_url() { - AGENDA_HTML_URL="" - AGENDA_PDF_URL="" - AGENDA_REVISE_HTML_URL="" - AGENDA_REVISE_PDF_URL="" - MINUTES_HTML_URL="" - MINUTES_PDF_URL="" - MINUTES_ATTACH_PDF_URL="" + AGENDA_HTML_URL="" + AGENDA_PDF_URL="" + AGENDA_REVISE_HTML_URL="" + AGENDA_REVISE_PDF_URL="" + MINUTES_HTML_URL="" + MINUTES_PDF_URL="" + MINUTES_ATTACH_PDF_URL="" - AGENDA_FULL_HTML_URL="" - AGENDA_FULL_PDF_URL="" - AGENDA_COVER_HTML_URL="" - AGENDA_COVER_PDF_URL="" - AGENDA_POST_HTML_URL="" - AGENDA_POST_PDF_URL="" - ADDENDUM_HTML_URL="" - ADDENDUM_PDF_URL="" + AGENDA_FULL_HTML_URL="" + AGENDA_FULL_PDF_URL="" + AGENDA_COVER_HTML_URL="" + AGENDA_COVER_PDF_URL="" + AGENDA_POST_HTML_URL="" + AGENDA_POST_PDF_URL="" + ADDENDUM_HTML_URL="" + ADDENDUM_PDF_URL="" +} + +download_helper() { + local url="$1" + local out="$2" + local code + + code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 -z "$out" -o "$out" "$url") + case "$code" in + 200) + echo "Downloaded." + ;; + 304) + echo "Already exists! Skipping." + ;; + *) + echo "FAILED! $code: $out | $url" >&2 + return 1 + ;; + esac } download_agendas() { - if [[ $AGENDA_REVISE_PDF_URL != "" ]] || [[ $AGENDA_PDF_URL != "" ]]; then - if [[ $AGENDA_REVISE_PDF_URL != "" ]]; then - echo "Saving revised agenda as PDF..." - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$1/Agenda_Revised.pdf" -N -q #--show-progress - fi - if [[ $AGENDA_PDF_URL != "" ]]; then - echo "Saving regular agenda as PDF..." - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$1/Agenda.pdf" -N -q #--show-progress - fi - elif [[ $AGENDA_REVISE_HTML_URL != "" ]] || [[ $AGENDA_HTML_URL != "" ]]; then - if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then - echo "Saving revised agenda as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$1/Agenda_Revised.html" -N -q #--show-progress - fi - if [[ $AGENDA_HTML_URL != "" ]]; then - echo "Saving regular agenda as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$1/Agenda.html" -N -q #--show-progress - fi - elif [[ $AGENDA_FULL_PDF_URL != "" ]] || [[ $AGENDA_FULL_HTML_URL != "" ]]; then - if [[ $AGENDA_FULL_PDF_URL != "" ]]; then - echo "Saving full package agenda as PDF... (no HTML found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_PDF_URL" -O "$1/Agenda_FullPackage.pdf" -N -q #--show-progress - fi - if [[ $AGENDA_FULL_HTML_URL != "" ]]; then - echo "Saving full package agenda as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_FULL_HTML_URL" -O "$1/Agenda_FullPackage.html" -N -q #--show-progress - fi - elif [[ $AGENDA_POST_PDF_URL != "" ]] || [[ $AGENDA_POST_HTML_URL != "" ]]; then - if [[ $AGENDA_POST_PDF_URL != "" ]]; then - echo "Saving post agenda as HTML... (no HTML found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_PDF_URL" -O "$1/Agenda_Post.pdf" -N -q #--show-progress - fi - if [[ $AGENDA_POST_HTML_URL != "" ]]; then - echo "Saving post agenda as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_POST_HTML_URL" -O "$1/Agenda_Post.html" -N -q #--show-progress - fi + local outdir="$1" + + if [[ -n $AGENDA_REVISE_PDF_URL ]]; then + echo "Saving revised agenda as PDF..." + download_helper "$AGENDA_REVISE_PDF_URL" "$outdir/Agenda_Revised.pdf" fi - if [[ $MINUTES_ATTACH_PDF_URL != "" ]] || [[ $MINUTES_PDF_URL != "" ]]; then - if [[ $MINUTES_ATTACH_PDF_URL != "" ]]; then - echo "Saving minutes with attachments as PDF..." - wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$1/Minutes_With_Attachments.pdf" -N -q #--show-progress - fi - if [[ $MINUTES_PDF_URL != "" ]]; then - echo "Saving minutes as PDF..." - wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$1/Minutes.pdf" -N -q #--show-progress - fi - else - if [[ $MINUTES_HTML_URL != "" ]]; then - echo "Saving minutes as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$1/Minutes.html" -N -q #--show-progress - fi - fi + if [[ -n $AGENDA_PDF_URL ]]; then + echo "Saving regular agenda as PDF..." + download_helper "$AGENDA_PDF_URL" "$outdir/Agenda.pdf" + fi - if [[ $AGENDA_COVER_PDF_URL != "" ]]; then - echo "Saving cover agenda as PDF... (no HTML found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_PDF_URL" -O "$1/Agenda_Cover.pdf" -N -q #--show-progress + if [[ -z $AGENDA_REVISE_PDF_URL && -n $AGENDA_REVISE_HTML_URL ]]; then + echo "Saving revised agenda as HTML... (no PDF found!)" + download_helper "$AGENDA_REVISE_HTML_URL" "$outdir/Agenda_Revised.html" fi - if [[ $AGENDA_COVER_HTML_URL != "" ]]; then - echo "Saving cover agenda as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O "$1/Agenda_Cover.html" -N -q #--show-progress + + if [[ -z $AGENDA_PDF_URL && -n $AGENDA_HTML_URL ]]; then + echo "Saving regular agenda as HTML... (no PDF found!)" + download_helper "$AGENDA_HTML_URL" "$outdir/Agenda.html" fi - if [[ $ADDENDUM_PDF_URL != "" ]]; then - echo "Saving addendum as PDF... (no HTML found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_PDF_URL" -O "$1/Addendum.pdf" -N -q #--show-progress + + if [[ -n $AGENDA_FULL_PDF_URL ]]; then + echo "Saving full package agenda as PDF... (no HTML found!)" + download_helper "$AGENDA_FULL_PDF_URL" "$outdir/Agenda_FullPackage.pdf" fi - if [[ $ADDENDUM_HTML_URL != "" ]]; then - echo "Saving addendum as HTML... (no PDF found!)" - wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O "$1/Addendum.html" -N -q #--show-progress + + if [[ -z $AGENDA_FULL_PDF_URL && -n $AGENDA_FULL_HTML_URL ]]; then + echo "Saving full package agenda as HTML... (no PDF found!)" + download_helper "$AGENDA_FULL_HTML_URL" "$outdir/Agenda_FullPackage.html" + fi + + if [[ -n $AGENDA_POST_PDF_URL ]]; then + echo "Saving post agenda as PDF..." + download_helper "$AGENDA_POST_PDF_URL" "$outdir/Agenda_Post.pdf" + fi + + if [[ -z $AGENDA_POST_PDF_URL && -n $AGENDA_POST_HTML_URL ]]; then + echo "Saving post agenda as HTML... (no PDF found!)" + download_helper "$AGENDA_POST_HTML_URL" "$outdir/Agenda_Post.html" + fi + + if [[ -n $MINUTES_ATTACH_PDF_URL ]]; then + echo "Saving minutes with attachments as PDF..." + download_helper "$MINUTES_ATTACH_PDF_URL" "$outdir/Minutes_With_Attachments.pdf" + fi + + if [[ -n $MINUTES_PDF_URL ]]; then + echo "Saving minutes as PDF..." + download_helper "$MINUTES_PDF_URL" "$outdir/Minutes.pdf" + fi + + if [[ -z $MINUTES_ATTACH_PDF_URL && -z $MINUTES_PDF_URL && -n $MINUTES_HTML_URL ]]; then + echo "Saving minutes as HTML... (no PDF found!)" + download_helper "$MINUTES_HTML_URL" "$outdir/Minutes.html" + fi + + if [[ -n $AGENDA_COVER_PDF_URL ]]; then + echo "Saving cover agenda as PDF... (no HTML found!)" + download_helper "$AGENDA_COVER_PDF_URL" "$outdir/Agenda_Cover.pdf" + fi + + if [[ -z $AGENDA_COVER_PDF_URL && -n $AGENDA_COVER_HTML_URL ]]; then + echo "Saving cover agenda as HTML... (no PDF found!)" + download_helper "$AGENDA_COVER_HTML_URL" "$outdir/Agenda_Cover.html" + fi + + if [[ -n $ADDENDUM_PDF_URL ]]; then + echo "Saving addendum as PDF... (no HTML found!)" + download_helper "$ADDENDUM_PDF_URL" "$outdir/Addendum.pdf" + fi + + if [[ -z $ADDENDUM_PDF_URL && -n $ADDENDUM_HTML_URL ]]; then + echo "Saving addendum as HTML... (no PDF found!)" + download_helper "$ADDENDUM_HTML_URL" "$outdir/Addendum.html" fi } @@ -196,9 +222,9 @@ ADDENDUM_HTML="./tmp/addendum.html" current_year=$(date +%Y) current_month=$(date +%m) -current_day=$(date +%d)00 +current_day=$(date +%d) -SUPPORT_PAST=1 +SUPPORT_PAST="" if [ -d "$TEMP_DIR" ]; then rm -r $TEMP_DIR @@ -209,215 +235,226 @@ rm -f $AGENDA_HTML mkdir $TEMP_DIR -while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do - INDEX_URL=$(echo $INDEX_URL_PRE | sed 's/\"//g' | sed 's/,//g') - CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') - CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g') +while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do + INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//') + CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//') + CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//') - INDEX_END=0 - while (( ! INDEX_END )); do - echo "SCRAPE_ESCRIBE: Downloading eScribe index..." - wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress - if [ $? -ne 8 ]; then - FOUNDLIST=0 - while IFS= read -r LINE; do - if (( FOUNDLIST )); then - GREPENDLIST=$(echo $LINE | grep '