#!/usr/bin/env bash
#
# SCRAPE_OPENDATA.SH: Scrape Open Data from the City of London.
#
# Pages through the ArcGIS Hub search API (100 items per request) for a fixed
# set of group ids. For every item returned it:
#   * downloads the item's data file (when the item has a "name") and stores
#     it in a password-protected 7z archive under $DOCDIR;
#   * when the item's URL points at maps.london.ca, downloads CSV, Shapefile,
#     GeoJSON and KML exports of the layer and stores them in a 7z archive
#     under $MAPDIR.
#
# Requires: curl, jq, wget, 7z.
#
# Environment:
#   ARCHIVE_PASSWORD  Optional. Overrides the built-in 7z archive password.

# No `-e`: individual downloads are best-effort and handled explicitly below.
# `pipefail` is needed so a curl failure in `curl | jq` is not masked by jq.
set -uo pipefail

cat <<'BANNER'

-========================================================================-
-=- -=-
-=- SCRAPE_OPENDATA.SH: Scrape Open Data from the City of London -=-
-=- -=-
-=- https://gist.github.com/rvtr/******************************** -=-
-=- Lillian Skinner -=-
-=- -=-
-========================================================================-
BANNER

readonly WORKDIR="./tmp"
readonly STAGEDIR="./staging"
readonly DOCDIR="./LondonArchive_OpenData"
readonly MAPDIR="./LondonArchive_OpenData/Maps"
readonly WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

readonly PAGE_SIZE=100
readonly LIST_JSON="$WORKDIR/arcgis_list.json"
readonly SEARCH_URL="https://hub.arcgis.com/api/search/v1/collections/all/items"
readonly GROUP_FILTER="((group IN (de724381536540a5bf2d005fb32ec92a, d17e3e9bfd274e88aeed15fa165bf1e3, b7ab05d332c24dd2ba485acd2ac92837, b15cf62bc0a14990a75e348930b0cb4e)))"

# NOTE(review): the archive password is hard-coded and is passed to 7z on its
# command line, so it is visible in this file and in the process list. The
# default is kept so existing archives stay readable; set ARCHIVE_PASSWORD to
# override it.
ARCHIVE_PASSWORD="${ARCHIVE_PASSWORD:-AEF9D58B978A103B04016D600FD4B1E6943A3FF538B98B84F1C177B414F7018}"
readonly ARCHIVE_PASSWORD

# Print an error to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print a warning to stderr and carry on.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Abort early if any external tool used by this script is missing.
require_tools() {
  local tool
  for tool in curl jq wget 7z; do
    command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
  done
}

# Fetch one page of search results into $LIST_JSON.
# Arguments: $1 - zero-based page number
# Returns:   non-zero if curl or jq failed
fetch_page() {
  local page=$1
  local -a query=(
    --data-urlencode "filter=$GROUP_FILTER"
    --data-urlencode "limit=$PAGE_SIZE"
  )
  # The first page is requested without a startindex parameter.
  if ((page > 0)); then
    query+=(--data-urlencode "startindex=$((page * PAGE_SIZE))")
  fi
  curl --fail --get "${query[@]}" "$SEARCH_URL" | jq . >"$LIST_JSON"
}

# Print one field of one feature from $LIST_JSON as raw (unquoted) text.
# Arguments: $1 - feature index, $2 - jq path below the feature (e.g. .id)
item_field() {
  jq -r --argjson idx "$1" ".features[\$idx]$2" "$LIST_JSON"
}

# Make a value usable as a single path component by replacing "/" with "_".
safe_name() {
  printf '%s' "${1//\//_}"
}

# Download a URL to a file; on failure warn and remove the partial file so it
# is not archived.
# Arguments: $1 - URL, $2 - destination path
# Returns:   wget's success/failure
download() {
  local url=$1 dest=$2
  if wget --user-agent="$WGET_UA" "$url" -O "$dest" -c -q; then
    echo " Downloaded."
  else
    warn "download failed: $url"
    rm -f -- "$dest"
    return 1
  fi
}

# Add the staging directory to a password-protected 7z archive.
# Arguments: $1 - archive path
archive_staging() {
  local archive=$1
  echo "Compressing."
  7z a "-p$ARCHIVE_PASSWORD" "$archive" "$STAGEDIR" \
    || warn "7z failed for $archive"
}

# Download the four map exports for one layer and archive them.
# Arguments: $1 - item id, $2 - item title, $3 - item URL
export_map() {
  local item_id=$1 item_title=$2 item_url=$3
  local layer_id=${item_url##*/MapServer/}
  layer_id=${layer_id%/}
  echo " ^^^ Item is map. ($layer_id) "

  # Without a numeric layer id after /MapServer/ the export URLs below cannot
  # be built meaningfully, so skip instead of requesting garbage.
  if [[ ! "$layer_id" =~ ^[0-9]+$ ]]; then
    warn "no numeric MapServer layer id in $item_url; skipping map export"
    return 0
  fi

  # https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
  # KML and GeoJSON use the spatial ID of 4326, all others use 26917
  local base="https://hub.arcgis.com/api/v3/datasets/${item_id}_${layer_id}/downloads/data"
  local title_file format srid label url
  local saved=0
  title_file=$(safe_name "$item_title")

  for format in csv shp geojson kml; do
    case "$format" in
      csv)
        srid=26917
        label="(CSV) :"
        ;;
      shp)
        # NOTE(review): the Hub shapefile export may actually be a zip
        # archive; the .shp extension is kept for compatibility - confirm.
        srid=26917
        label="(Shapefile):"
        ;;
      geojson)
        srid=4326
        label="(GeoJSON) :"
        ;;
      kml)
        srid=4326
        label="(KML) :"
        ;;
    esac
    url="${base}?format=${format}&spatialRefId=${srid}&where=1=1"
    echo " Map URL $label $url"
    if download "$url" "$STAGEDIR/$title_file.$format"; then
      saved=$((saved + 1))
    fi
  done

  echo " Source URL is $item_url."

  if ((saved == 0)); then
    warn "no map export could be downloaded for \"$item_title\"; not archiving"
    return 0
  fi
  archive_staging "$MAPDIR/$title_file.7z"
}

# Handle one search result: log it, then fetch and archive its document
# and/or map exports.
# Arguments: $1 - page number (for logging), $2 - feature index in $LIST_JSON
process_item() {
  local page=$1 idx=$2
  local item_id item_title item_url item_name
  item_id=$(item_field "$idx" '.id')
  item_title=$(item_field "$idx" '.properties.title')
  item_url=$(item_field "$idx" '.properties.url')
  item_name=$(item_field "$idx" '.properties.name')

  echo "Cur. article: $page.$idx, ID : $item_id"
  echo " Cur. article: $page.$idx, Title: $item_title"
  echo " Cur. article: $page.$idx, URL : $item_url"
  echo " Cur. article: $page.$idx, Name : $item_name"

  # Start every item from an empty staging directory.
  rm -rf -- "${STAGEDIR:?}"
  mkdir -p -- "$STAGEDIR" || die "cannot create $STAGEDIR"

  # jq prints the string "null" for a missing name.
  if [[ -n "$item_name" && "$item_name" != "null" ]]; then
    local doc_file
    doc_file=$(safe_name "$item_name")
    if download \
      "https://www.arcgis.com/sharing/rest/content/items/$item_id/data" \
      "$STAGEDIR/$doc_file"; then
      archive_staging "$DOCDIR/$doc_file.7z"
    fi
  fi

  if [[ "$item_url" == *"maps.london.ca/server/rest/services"* ]]; then
    export_map "$item_id" "$item_title" "$item_url"
  fi
}

# Page through the search results until every matched item has been handled.
main() {
  require_tools
  mkdir -p -- "$WORKDIR" "$DOCDIR" "$MAPDIR" \
    || die "cannot create output directories"

  local page=0 total returned idx
  while true; do
    echo "Start index download..."
    fetch_page "$page" || die "failed to fetch search results (page $page)"

    total=$(jq -r '.numberMatched' "$LIST_JSON")
    returned=$(jq -r '.numberReturned' "$LIST_JSON")
    echo "Total items in JSON : $total"
    echo "Returned items : $returned"

    # A non-numeric count means the response is not a usable result page;
    # continuing would never satisfy the termination test below.
    if [[ ! "$total" =~ ^[0-9]+$ || ! "$returned" =~ ^[0-9]+$ ]]; then
      die "unexpected search response (see $LIST_JSON)"
    fi

    for ((idx = 0; idx < returned; idx++)); do
      process_item "$page" "$idx"
    done

    # Stop when all matched items were seen, or when a page comes back empty
    # (which would otherwise loop forever).
    if ((returned == 0 || page * PAGE_SIZE + returned >= total)); then
      echo "No more items!"
      break
    fi
    page=$((page + 1))
  done
}

main "$@"