LondonScrapers/SCRAPE_LPS.SH
2026-06-19 23:30:51 -04:00

56 lines
2.8 KiB
Bash
Executable File

#!/bin/bash
#
# SCRAPE_LPS.SH: downloads the London Police Service board meetings index page
# and walks it line by line, saving every linked PDF (meeting minutes and
# agenda/report packages) under
#   ./LondonArchive/LPS/Board/<ITEM_YEAR>/<ITEM_MONTH>-<ITEM_DAY>/<type>.pdf
#
# All paths are relative to the current working directory.
#
# Depends on ./functions/.functions for _time_parse_helper and
# _utils_download_helper.
# NOTE(review): judging by how they are used below, _time_parse_helper is
# expected to fill ITEM_YEAR / ITEM_MONTH / ITEM_DAY from a date string and
# _utils_download_helper to fetch <url> into <destination> -- confirm against
# the helper definitions.
#
# The UPPER_CASE and current_* variables keep their original names on purpose:
# the sourced helpers are not visible here and may read them as globals.

cat <<'BANNER'

-========================================================================-
-=- -=-
-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-
-=- -=-
-=- https://gist.github.com/rvtr/******************************** -=-
-=- Lillian Skinner -=-
-=- -=-
-========================================================================-
BANNER

# Fail early with a clear message instead of hitting "command not found" for
# every PDF link later on.
if [[ ! -r ./functions/.functions ]]; then
  printf 'SCRAPE_LPS.SH: cannot read ./functions/.functions\n' >&2
  exit 1
fi
source ./functions/.functions

# NOTE(review): MEETINGS_PAGE is not referenced anywhere in this script (the
# index is saved to ./tmp/index.html); kept in case a sourced helper reads it.
MEETINGS_PAGE="./tmp.html"

# London seems to have recently blocked unusual user agents, so the default
# wget agent (and even ping) is rejected. Pretend to be a regular browser.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# -p keeps re-runs quiet when the directories already exist.
mkdir -p "./LondonArchive/LPS" "./tmp"

# Add --show-progress here when debugging the download.
if ! wget --user-agent="$WGET_UA" \
  "https://londonpoliceserviceboard.com/board-meetings/" \
  -O "./tmp/index.html" -q; then
  printf 'SCRAPE_LPS.SH: failed to download the board meetings index page\n' >&2
  exit 1
fi

# NOTE(review): the current_* values are not used below; presumably consumed
# by the sourced helpers -- confirm.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# If these are not pre-set, "10#: invalid integer constant" is raised.
ITEM_YEAR="0000"
ITEM_MONTH="00"
ITEM_DAY="00"

# Start from a known state rather than whatever the environment exported.
ATTACH_TYPE=""

# "|| [[ -n ... ]]" also processes a final line that lacks a trailing newline.
while IFS= read -r LINE_PRE || [[ -n "$LINE_PRE" ]]; do
  # Turn every UTF-8 non-breaking space (bytes C2 A0) into a plain space.
  LINE=${LINE_PRE//$'\xC2\xA0'/ }

  # Collapse whitespace runs and trim both ends. Splitting with read -a gives
  # the same normalisation an unquoted expansion would, but without pathname
  # expansion mangling lines that contain *, ? or [.
  IFS=$' \t\n' read -r -a lps_words <<< "$LINE"
  printf -v LINE '%s ' "${lps_words[@]}"
  LINE=${LINE% }

  # Section title (<h3 class="fusion-title-heading ...">Title</h3>).
  FOUND_ATTACH_TYPE=""
  if [[ "$LINE" == *'<h3 class="fusion-title-heading'* ]]; then
    FOUND_ATTACH_TYPE=$(printf '%s\n' "$LINE" \
      | sed 's/.*<h3[^>]*>\([^<]*\)<[\/:-]h3>.*/\1/')
  fi

  # Tab panel marker: whatever follows "fusion-tab-" in aria-labelledby.
  FOUND_YEAR_HEADER=""
  if [[ "$LINE" == *'tabindex="0" aria-labelledby="fusion-tab-'* ]]; then
    FOUND_YEAR_HEADER=$(printf '%s\n' "$LINE" \
      | sed -e 's/.*aria-labelledby="\([^"]*\)".*/\1/' -e 's/.*fusion-tab-//')
  fi

  # The attachment type is only re-evaluated on lines that carry a tab marker;
  # an unrecognised title leaves the previous ATTACH_TYPE in effect.
  if [[ -n "$FOUND_YEAR_HEADER" ]]; then
    printf '%s\n' "$FOUND_ATTACH_TYPE"
    printf '%s\n' "$FOUND_YEAR_HEADER"
    YEAR="$FOUND_YEAR_HEADER"
    case "$FOUND_ATTACH_TYPE" in
      "Meeting Minutes") ATTACH_TYPE="Minutes" ;;
      "Agenda and Report Packages") ATTACH_TYPE="Agenda" ;;
    esac
  fi

  # A download row is a table cell holding a link with a literal ".pdf" in it.
  FOUND_LINK=""
  if [[ "$LINE" == *'a href="'* && "$LINE" == *'.pdf'* \
    && "$LINE" == *'<td valign="top">'* ]]; then
    FOUND_LINK="$LINE"
  fi

  if [[ -n "$ATTACH_TYPE" && -n "$FOUND_LINK" ]]; then
    # Link text, cut right after the first 4-digit number and trimmed.
    lps_link_text=$(printf '%s\n' "$FOUND_LINK" \
      | sed -e 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' \
        -e 's/\([0-9]\{4\}\).*/\1/' \
        -e 's/^[[:space:]]*//' \
        -e 's/[[:space:]]*$//')
    # Target of the last href attribute on the line.
    lps_link_url=$(printf '%s\n' "$FOUND_LINK" | sed 's/.*href="\([^"]*\)".*/\1/')

    _time_parse_helper "$lps_link_text"
    printf '%s\n' "$ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
    printf '%s\n' "$lps_link_url"

    lps_target_dir="./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY"
    mkdir -p "$lps_target_dir/"
    _utils_download_helper "$lps_link_url" "$lps_target_dir/$ATTACH_TYPE.pdf"
  fi
done < "./tmp/index.html"