#!/usr/bin/env bash

# Start-up banner.
echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
echo -e "-=- -=-"
echo -e "-=- https://gist.github.com/rvtr/******************************** -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

#######################################
# Parse a date written as "Month D, YYYY" into zero-padded numeric parts.
# Globals (all written):
#   MEETING_MONTH_WORD  leading alphabetic word of the input ("" if none)
#   MEETING_DAY_SHORT   day digits exactly as found in the input ("" if none)
#   MEETING_DAY         two-digit day; "00" when the date cannot be parsed
#   MEETING_YEAR        year digits; "0000" when the date cannot be parsed
#   MEETING_MONTH       "01".."12"; "--" when the month is not recognised
# Arguments:
#   $1 - date text, e.g. "June 8, 2023"
# Outputs:
#   Echoes $1 unchanged to stdout.
# Returns:
#   0 if month, day and year were all recognised, 1 otherwise.
#######################################
conv_date() {
  local raw=$1
  local word_re='^[[:space:]]*([A-Za-z]+)'
  local date_re='^[[:space:]]*[A-Za-z]+[[:space:]]+([0-9]{1,2})(,[[:space:]]*|[[:space:]]+)([0-9]+)'
  local parsed=0

  printf '%s\n' "$raw"

  # Start from safe sentinels so that a non-matching input can never leak the
  # raw text into the day/year fields (they are used to build directory names).
  MEETING_MONTH_WORD=""
  MEETING_DAY_SHORT=""
  MEETING_DAY="00"
  MEETING_YEAR="0000"
  MEETING_MONTH="--"

  if [[ "$raw" =~ $word_re ]]; then
    MEETING_MONTH_WORD=${BASH_REMATCH[1]}
  fi

  if [[ "$raw" =~ $date_re ]]; then
    MEETING_DAY_SHORT=${BASH_REMATCH[1]}
    MEETING_YEAR=${BASH_REMATCH[3]}
    # Force base 10: a bare "08" or "09" would otherwise be rejected as an
    # invalid octal number and the day would silently become "00".
    printf -v MEETING_DAY '%02d' "$((10#${MEETING_DAY_SHORT}))"
    parsed=1
  fi

  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May)  MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *)    MEETING_MONTH="--" ;;
  esac

  [[ "$parsed" -eq 1 && "$MEETING_MONTH" != "--" ]]
}

MEETINGS_PAGE="./tmp.html"

# London seems to have recently started blocking unusual user agents, so the
# default wget user agent (and even ping) gets rejected. Thankfully we can
# pretend to be a regular browser.
# Main script body: set-up, index download and the per-line scrape loop.
#
# What the visible statements do, in order:
#   1. Define a browser-like user agent string in WGET_UA.
#   2. Create ./LondonArchive, ./LondonArchive/LPS and ./tmp.
#   3. Fetch the board-meetings page into ./tmp/index.html using WGET_UA.
#   4. Record today's year/month/day and preset MEETING_YEAR/MONTH/DAY to zeros.
#   5. Read ./tmp/index.html line by line; replace the first non-breaking space
#      (bytes C2 A0) on each line with a plain space.
#   6. When both ATTACH_TYPE and FOUND_LINK are non-empty: hand the link text,
#      cut off after its first four-digit run and trimmed, to conv_date; print
#      the resulting date and the href; create
#      ./LondonArchive/LPS/Board/YEAR/MONTH-DAY/ and download the href there as
#      ATTACH_TYPE.pdf.
#
# NOTE(review): several statements share one physical line here, so bash does
# not see them as separate commands, and everything after the first unquoted
# '#' on a line is treated as a comment. Put one statement per line before
# running this.
# NOTE(review): the pattern passed to `grep` is cut off, and no assignment to
# ATTACH_TYPE or FOUND_LINK is visible (only FOUND_ATTACH_TYPE is assigned) even
# though the `if` depends on both. The first sed expression applied to
# FOUND_LINK also looks truncated at its start. Restore these from a known-good
# copy of the script - TODO confirm against the original gist.
# NOTE(review): the per-document wget does not pass --user-agent, unlike the
# index fetch above - confirm that this is intentional.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87" mkdir "./LondonArchive" mkdir "./LondonArchive/LPS" mkdir "./tmp" wget --user-agent="$WGET_UA" "https://londonpoliceserviceboard.com/board-meetings/" -O "./tmp/index.html" -q #--show-progress current_year=$(date +%Y) current_month=$(date +%m) current_day=$(date +%d) # If I don't set these values then "10#: invalid integer constant" MEETING_YEAR="0000" MEETING_MONTH="00" MEETING_DAY="00" while IFS= read -r LINE_PRE; do LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /') FOUND_ATTACH_TYPE="$(echo $LINE | grep '

') if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then conv_date "$(echo $FOUND_LINK | sed 's/.*]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')" echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY" echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" mkdir -p "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/" wget "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" -O "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/$ATTACH_TYPE.pdf" -q fi done < "./tmp/index.html"