LondonScrapers/SCRAPE_OPEN.SH

99 lines
4.4 KiB
Bash

#!/usr/bin/env bash
# Startup banner: one blank line, then the boxed script title, source link
# and author credit. The quoted delimiter keeps every character literal.
cat <<'BANNER'

-========================================================================-
-=- -=-
-=- SCRAPE_OPENDATA.SH: Scrape Open Data from the City of London -=-
-=- -=-
-=- https://gist.github.com/rvtr/******************************** -=-
-=- Lillian Skinner -=-
-=- -=-
-========================================================================-
BANNER
# --- Configuration -----------------------------------------------------------
# Scratch directory; the downloaded search index (arcgis_list.json) lives here.
WORKDIR="./tmp"
# Per-item staging directory; it is wiped and recreated for every item.
STAGEDIR="./staging"
# Destination for the compressed item downloads.
DOCDIR="./LondonArchive_OpenData"
# Destination for the compressed map exports (nested inside DOCDIR).
MAPDIR="./LondonArchive_OpenData/Maps"
# User agent string handed to every wget download.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# -p makes re-runs quiet when the directories already exist and creates
# DOCDIR on the way to MAPDIR; a real failure (e.g. permissions) is fatal
# because nothing later in the script can work without these directories.
mkdir -p -- "$WORKDIR" "$DOCDIR" "$MAPDIR" || {
  echo "Could not create working directories." >&2
  exit 1
}
# --- Scrape loop -------------------------------------------------------------
# Pages through the ArcGIS Hub search results for the configured groups. For
# every item it downloads the attached data file (when the item has a name)
# and, for items served from maps.london.ca, the CSV/Shapefile/GeoJSON/KML
# exports. Each download set is packed into a password-protected 7z archive.

# Password applied to every archive. Defaults to the value this script has
# always used; export ARCHIVE_PASSWORD to override it without editing the file.
ARCHIVE_PASSWORD="${ARCHIVE_PASSWORD:-AEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018}"
# Number of search results requested per page.
PAGE_SIZE=100
# File holding the page of search results currently being processed.
LIST_JSON="$WORKDIR/arcgis_list.json"

# Print one field of a search result as raw text (no JSON quoting).
#   $1 - zero-based index into .features
#   $2 - jq path relative to the feature, e.g. ".properties.title"
item_field() {
  jq -r --argjson idx "$1" ".features[\$idx]$2" "$LIST_JSON"
}

# Turn a remote-supplied name into a single safe path component by replacing
# every "/" with "_", so it cannot point outside the target directory.
#   $1 - raw name
safe_name() {
  printf '%s' "${1//\//_}"
}

# Download a URL into a file and report the outcome.
#   $1 - URL, $2 - destination path
# Returns non-zero (and removes the incomplete file) when wget fails.
fetch() {
  if wget --user-agent="$WGET_UA" "$1" -O "$2" -c -q; then
    echo " Downloaded."
  else
    echo " Download failed: $1" >&2
    rm -f -- "$2"
    return 1
  fi
}

# Download one export format of the current map layer into the staging dir.
# Reads the globals ITEM_ID, MAP_ID and SAFE_TITLE set by the item loop.
#   $1 - log label, $2 - format (also the file extension), $3 - spatialRefId
fetch_map_format() {
  local url="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=$2&spatialRefId=$3&where=1=1"
  echo "$1 $url"
  fetch "$url" "$STAGEDIR/$SAFE_TITLE.$2"
}

# Pack the staging directory into the password-protected archive $1.
compress_stage() {
  echo "Compressing."
  7z a "-p$ARCHIVE_PASSWORD" "$1" "$STAGEDIR"
}

i=0
SEARCH_END=0
while [[ $SEARCH_END == 0 ]]; do
  QUERY=(
    --data-urlencode "filter=((group IN (de724381536540a5bf2d005fb32ec92a, d17e3e9bfd274e88aeed15fa165bf1e3, b7ab05d332c24dd2ba485acd2ac92837, b15cf62bc0a14990a75e348930b0cb4e)))"
    --data-urlencode "limit=$PAGE_SIZE"
  )
  # The first page is requested without an offset; later pages pass one.
  if ((i > 0)); then
    QUERY+=(--data-urlencode "startindex=$((i * PAGE_SIZE))")
  fi
  echo "Start index download..."
  curl --get "${QUERY[@]}" \
    "https://hub.arcgis.com/api/search/v1/collections/all/items" \
    | jq . > "$LIST_JSON"
  TOTAL_ITEMS=$(jq -r '.numberMatched' "$LIST_JSON")
  RETURNED_ITEMS=$(jq -r '.numberReturned' "$LIST_JSON")
  # A failed download or unexpected JSON leaves these empty or "null"; stop
  # here instead of looping forever on an arithmetic error further down.
  if ! [[ $TOTAL_ITEMS =~ ^[0-9]+$ && $RETURNED_ITEMS =~ ^[0-9]+$ ]]; then
    echo "Index download failed or returned unexpected data; stopping." >&2
    exit 1
  fi
  echo "Total items in JSON : $TOTAL_ITEMS"
  echo "Returned items : $RETURNED_ITEMS"
  for ((j = 0; j < RETURNED_ITEMS; j++)); do
    ITEM_ID=$(item_field "$j" ".id")
    ITEM_TITLE=$(item_field "$j" ".properties.title")
    ITEM_URL=$(item_field "$j" ".properties.url")
    ITEM_NAME=$(item_field "$j" ".properties.name")
    echo "Cur. article: $i.$j, ID : $ITEM_ID"
    echo " Cur. article: $i.$j, Title: $ITEM_TITLE"
    echo " Cur. article: $i.$j, URL : $ITEM_URL"
    echo " Cur. article: $i.$j, Name : $ITEM_NAME"
    # Start every item with an empty staging directory. The :? guard aborts
    # rather than running rm -rf on an empty path if STAGEDIR is ever unset.
    rm -rf -- "${STAGEDIR:?}"
    mkdir -p -- "$STAGEDIR"
    # Items with a file name carry a downloadable data attachment.
    if [[ -n $ITEM_NAME && $ITEM_NAME != "null" ]]; then
      SAFE_NAME=$(safe_name "$ITEM_NAME")
      if fetch "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" "$STAGEDIR/$SAFE_NAME"; then
        compress_stage "$DOCDIR/$SAFE_NAME.7z"
      fi
    fi
    if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]]; then
      # The layer number is whatever follows the last "/MapServer/".
      MAP_ID=${ITEM_URL##*/MapServer/}
      SAFE_TITLE=$(safe_name "$ITEM_TITLE")
      echo " ^^^ Item is map. ($MAP_ID) "
      # https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
      # KML and GeoJSON use the spatial ID of 4326, all others use 26917
      MAP_OK=0
      fetch_map_format " Map URL (CSV) :" csv 26917 && MAP_OK=1
      fetch_map_format " Map URL (Shapefile):" shp 26917 && MAP_OK=1
      fetch_map_format " Map URL (GeoJSON) :" geojson 4326 && MAP_OK=1
      fetch_map_format " Map URL (KML) :" kml 4326 && MAP_OK=1
      echo " Source URL is $ITEM_URL."
      # Only build an archive when at least one export actually arrived.
      if ((MAP_OK)); then
        compress_stage "$MAPDIR/$SAFE_TITLE.7z"
      else
        echo " No map exports could be downloaded; skipping archive." >&2
      fi
    fi
  done
  # Finished once this page reaches the reported total. An empty page also
  # ends the run, since asking for further pages could never make progress.
  if ((RETURNED_ITEMS == 0 || i * PAGE_SIZE + RETURNED_ITEMS >= TOTAL_ITEMS)); then
    echo "No more items!"
    SEARCH_END=1
    break
  fi
  ((i++))
done