Scrape_CivicWeb-FilePro/SCRAPE_FILEPRO.SH

66 lines
2.1 KiB
Bash

#!/usr/bin/env bash
# Startup banner: one blank line, then the framed script title and author.
printf '\n'
printf '%s\n' \
  "-========================================================================-" \
  "-=- -=-" \
  "-=- SCRAPE_FILEPRO.SH: Downloads all FilePro (CivicWeb) documents -=-" \
  "-=- -=-" \
  "-=- Lillian Skinner -=-" \
  "-=- -=-" \
  "-========================================================================-"
#######################################
# Fetch one URL into a local file, skipping it when the local copy is current.
#
# curl is asked for a conditional download (-z "$out"): when "$out" exists,
# curl uses that file's modification time as the time condition, so an
# unchanged document is answered with HTTP 304 instead of being re-sent.
# The body is written to "<out>.part" and only renamed to "<out>" after a
# complete HTTP 200 transfer, so an error page or a truncated transfer is
# never left behind looking like a finished download.
#
# Globals:
#   WGET_UA (read) - User-Agent string sent with the request
# Arguments:
#   $1 - URL to download
#   $2 - destination file path (its directory must already exist)
# Outputs:
#   one status line on stdout; failure details on stderr
# Returns:
#   0 on HTTP 200 or 304, 1 on any other outcome
#######################################
download_helper() {
  local url="$1"
  local out="$2"
  local part="${out}.part"
  local code
  local rc=0

  # A leftover from an interrupted run must not be mistaken for fresh data.
  rm -f -- "$part"

  # NOTE(review): -k disables TLS certificate verification; kept because the
  # script has always run that way - confirm it is still required.
  code=$(curl -L -k -A "${WGET_UA-}" -sS -w "%{http_code}" \
    --retry 3 --retry-delay 2 -z "$out" -o "$part" "$url") || rc=$?

  # curl can fail after the status line was received (e.g. connection dropped
  # mid-body), so the HTTP code alone does not prove the file is complete.
  if ((rc != 0)); then
    rm -f -- "$part"
    echo "FAILED! curl exit $rc (HTTP $code): $out | $url" >&2
    return 1
  fi

  case "$code" in
    200)
      # Nothing to rename if curl wrote no output file at all.
      if [[ -e "$part" ]] && ! mv -f -- "$part" "$out"; then
        rm -f -- "$part"
        echo "FAILED! $code: $out | $url" >&2
        return 1
      fi
      echo "Downloaded."
      ;;
    304)
      rm -f -- "$part"
      echo "Already exists! Skipping."
      ;;
    *)
      # Discard whatever body came with the error response.
      rm -f -- "$part"
      echo "FAILED! $code: $out | $url" >&2
      return 1
      ;;
  esac
}
#######################################
# Recursively mirror one FilePro folder listing into a local directory.
#
# The listing page is fetched with wget and scanned line by line for
# data-id / data-title / data-type attributes. Entries of type "document"
# are handed to download_helper; entries of type "folder" are descended
# into by calling this function again. Lines of any other type are ignored.
#
# Globals:
#   START_URL (read) - site base URL used to build document/folder URLs
#   WGET_UA   (read) - User-Agent string passed to wget
# Arguments:
#   $1 - local directory to download into (created when a document is found)
#   $2 - URL of the folder listing page to fetch
#   $3 - slash-separated chain of folder IDs, used only in log output
# Outputs:
#   progress messages on stdout, errors on stderr
# Returns:
#   1 if the temp file cannot be created or the listing cannot be fetched,
#   0 otherwise (a failed document or sub-folder is reported but not fatal)
#######################################
download_folder() {
  local tmp_dir="$1"
  local tmp_index
  local LINE
  local LINE_ID
  local LINE_TITLE
  local LINE_TYPE
  # The greedy leading ".*" makes each pattern capture the last occurrence
  # of the attribute on a line.
  local re_id='.*data-id="([^"]*)"'
  local re_title='.*data-title="([^"]*)"'
  local re_type='.*data-type="([^"]*)"'

  echo "Looking in folder $3"
  echo "Download to $tmp_dir/"

  if ! tmp_index=$(mktemp); then
    echo "FAILED! could not create a temporary file for $2" >&2
    return 1
  fi

  # NOTE(review): --no-check-certificate disables TLS verification; kept
  # because the script has always run that way - confirm it is still required.
  if ! wget --no-check-certificate --user-agent="${WGET_UA-}" "$2" \
    -O "$tmp_index" --no-hsts -q; then
    echo "FAILED! could not fetch folder listing: $2" >&2
    rm -f -- "$tmp_index"
    return 1
  fi

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r LINE || [[ -n "$LINE" ]]; do
    [[ "$LINE" =~ $re_type ]] || continue
    LINE_TYPE="${BASH_REMATCH[1]}"
    [[ "$LINE_TYPE" == "document" || "$LINE_TYPE" == "folder" ]] || continue

    [[ "$LINE" =~ $re_id ]] || continue
    LINE_ID="${BASH_REMATCH[1]}"

    LINE_TITLE=""
    if [[ "$LINE" =~ $re_title ]]; then
      LINE_TITLE="${BASH_REMATCH[1]}"
    fi

    # Decode the HTML entities for apostrophe and ampersand; "&amp;" goes
    # last so that "&amp;#39;" is not decoded twice.
    if [[ "$LINE_TITLE" == *"&"* ]]; then
      LINE_TITLE=$(printf '%s\n' "$LINE_TITLE" \
        | sed -e "s/&#39;/'/g" -e "s/&apos;/'/g" -e 's/&amp;/\&/g')
    fi

    # The title becomes a single path component: fall back to the ID when it
    # is empty, and keep "/", "." and ".." from redirecting the download.
    [[ -n "$LINE_TITLE" ]] || LINE_TITLE="$LINE_ID"
    LINE_TITLE="${LINE_TITLE//\//_}"
    if [[ "$LINE_TITLE" == "." || "$LINE_TITLE" == ".." ]]; then
      LINE_TITLE="_${LINE_TITLE}"
    fi

    if [[ "$LINE_TYPE" == "document" ]]; then
      echo "Found document: $LINE_ID : $LINE_TITLE.pdf... downloading..."
      mkdir -p -- "$tmp_dir"
      download_helper "${START_URL}/document/$LINE_ID" "$tmp_dir/$LINE_TITLE.pdf"
    else
      download_folder "$tmp_dir/$LINE_TITLE" \
        "${START_URL}/filepro/documents/$LINE_ID" "$3/$LINE_ID"
    fi
  done < "$tmp_index"

  rm -f -- "$tmp_index"
}
# Base URL of the CivicWeb site to mirror; download_folder reads it as a
# global when building document and sub-folder URLs.
START_URL="https://aylmer.civicweb.net"

# NOTE(review): WGET_UA (the User-Agent passed to wget and curl) is not
# assigned in this part of the script - presumably it comes from the
# environment; verify before running.

# Crawl the tree once, starting at the root listing (logged as folder "0").
# Called directly so the crawl's exit status becomes the script's exit status.
download_folder "./FilePro_Dump" "${START_URL}/filepro/documents" "0"