#!/usr/bin/env bash
#
# SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes.

# Startup banner: a blank line followed by the boxed title block.
# printf '%s\n' prints each argument on its own line and, unlike `echo -e`,
# never interprets backslashes or leading dashes in the text.
printf '%s\n' \
  '' \
  '-========================================================================-' \
  '-=- -=-' \
  '-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-' \
  '-=- -=-' \
  '-=- https://gist.github.com/rvtr/******************************** -=-' \
  '-=- Lillian Skinner -=-' \
  '-=- -=-' \
  '-========================================================================-'

#######################################
# Split a written date such as "June 8, 2023" into zero-padded components.
# Globals:
#   MEETING_MONTH_WORD (written) - month word as found in the input, or ""
#   MEETING_DAY_SHORT  (written) - day digits as found in the input, or ""
#   MEETING_YEAR       (written) - year digits, or "0000" if unparseable
#   MEETING_MONTH      (written) - "01".."12", or "--" for an unknown month
#   MEETING_DAY        (written) - two-digit day, or "00" if unparseable
# Arguments:
#   $1 - date text in "<Month> <day>, <year>" form
# Outputs:
#   Echoes $1 unchanged to stdout.
#######################################
conv_date() {
  printf '%s\n' "$1"

  # Month word (optionally abbreviated with a trailing dot), day, optional
  # comma, year. Anything after the year is ignored.
  local date_re='^[[:space:]]*([A-Za-z]+)\.?[[:space:]]+([0-9]+),?[[:space:]]+([0-9]+)'

  if [[ "$1" =~ $date_re ]]; then
    MEETING_MONTH_WORD="${BASH_REMATCH[1]}"
    MEETING_DAY_SHORT="${BASH_REMATCH[2]}"
    MEETING_YEAR="${BASH_REMATCH[3]}"
    # Force base 10: a zero-padded day such as "08" or "09" would otherwise
    # be rejected as an invalid octal number.
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  else
    # Unparseable input: fall back to fixed placeholders instead of leaking
    # the raw text into the year/day fields.
    MEETING_MONTH_WORD=""
    MEETING_DAY_SHORT=""
    MEETING_YEAR="0000"
    MEETING_DAY="00"
  fi

  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}

# Not referenced by the visible part of this script; kept in case other code
# reads it.
MEETINGS_PAGE="./tmp.html"

# London seems to have recently blocked unusual user agents: plain wget (and
# even ping) gets rejected. Thankfully, pretending to be a regular desktop
# browser works.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# -p creates missing parents and stays quiet when the directories already
# exist, so the script can be re-run without "File exists" errors.
mkdir -p "./LondonArchive/LPS" "./tmp"

# Fetch the board-meetings index page. Without it there is nothing to parse,
# so stop here rather than continue with an empty file.
# (Add --show-progress to the wget call to watch the download.)
if ! wget --user-agent="$WGET_UA" "https://londonpoliceserviceboard.com/board-meetings/" -O "./tmp/index.html" -q; then
  printf 'error: could not download the board meetings index page\n' >&2
  exit 1
fi

# Today's date, split into components.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# Defaults so the meeting date fields are never empty. The original author
# notes that leaving them unset led to "10#: invalid integer constant".
MEETING_YEAR="0000"
MEETING_MONTH="00"
MEETING_DAY="00"

# Walk the downloaded index page line by line. Year-tab header lines select
# the current year / attachment type; table rows that link to a PDF are
# downloaded into ./LondonArchive/LPS/Board/<year>/<month>-<day>/<type>.pdf.
ATTACH_TYPE=""
while IFS= read -r LINE_PRE || [[ -n "$LINE_PRE" ]]; do
  # Turn every UTF-8 non-breaking space (bytes C2 A0) into a plain space.
  LINE="${LINE_PRE//$'\xC2\xA0'/ }"
  # Collapse whitespace runs and trim both ends. read -a splits on IFS
  # without glob-expanding characters such as '*' found in the HTML.
  read -r -a LINE_WORDS <<<"$LINE"
  LINE="${LINE_WORDS[*]}"

  # Section heading text, when this line carries a fusion title heading.
  FOUND_ATTACH_TYPE=""
  if [[ "$LINE" == *'<h3 class="fusion-title-heading'* ]]; then
    FOUND_ATTACH_TYPE="$(printf '%s\n' "$LINE" | sed 's/.*<h3[^>]*>\([^<]*\)<[\/:-]h3>.*/\1/')"
  fi

  # Tab label (the part after "fusion-tab-"), when this line opens a tab.
  FOUND_YEAR_HEADER=""
  if [[ "$LINE" == *'tabindex="0" aria-labelledby="fusion-tab-'* ]]; then
    FOUND_YEAR_HEADER="$(printf '%s\n' "$LINE" \
      | sed -e 's/.*aria-labelledby="\([^"]*\)".*/\1/' -e 's/.*fusion-tab-//')"
  fi

  # NOTE(review): the attachment type is only updated on lines that ALSO
  # carry a year-tab header, i.e. the heading and the tab markup must share
  # one line of the page source. Confirm against the live page.
  if [[ -n "$FOUND_YEAR_HEADER" ]]; then
    printf '%s\n' "$FOUND_ATTACH_TYPE"
    printf '%s\n' "$FOUND_YEAR_HEADER"
    YEAR="$FOUND_YEAR_HEADER"
    if [[ "$FOUND_ATTACH_TYPE" == "Meeting Minutes" ]]; then
      ATTACH_TYPE="Minutes"
    elif [[ "$FOUND_ATTACH_TYPE" == "Agenda and Report Packages" ]]; then
      ATTACH_TYPE="Agenda"
    fi
  fi

  # A table cell that links to a PDF (literal ".pdf", not "any char + pdf").
  FOUND_LINK=""
  if [[ "$LINE" == *'a href="'* && "$LINE" == *.pdf* && "$LINE" == *'<td valign="top">'* ]]; then
    FOUND_LINK="$LINE"
  fi

  if [[ -n "$ATTACH_TYPE" && -n "$FOUND_LINK" ]]; then
    # Link text, cut right after the first 4-digit run (the year), trimmed.
    LINK_DATE="$(printf '%s\n' "$FOUND_LINK" \
      | sed -e 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' \
            -e 's/\([0-9]\{4\}\).*/\1/' \
            -e 's/^[[:space:]]*//' \
            -e 's/[[:space:]]*$//')"
    # Extract the target URL once and reuse it for the log line and download.
    LINK_URL="$(printf '%s\n' "$FOUND_LINK" | sed 's/.*href="\([^"]*\)".*/\1/')"

    conv_date "$LINK_DATE"
    printf '%s\n' "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
    printf '%s\n' "$LINK_URL"

    DEST_DIR="./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
    DEST_FILE="$DEST_DIR/$ATTACH_TYPE.pdf"
    mkdir -p "$DEST_DIR/"

    # Send the same browser user agent as the index fetch; the site blocks
    # unusual agents. On failure, report it and remove the empty/partial file
    # that wget -O leaves behind.
    if ! wget --user-agent="$WGET_UA" "$LINK_URL" -O "$DEST_FILE" -q; then
      printf 'warning: failed to download %s\n' "$LINK_URL" >&2
      rm -f -- "$DEST_FILE"
    fi
  fi
done < "./tmp/index.html"