diff --git a/SCRAPE_PLANAPPS.SH b/SCRAPE_PLANAPPS.SH index 22d993c..38e5ec0 100644 --- a/SCRAPE_PLANAPPS.SH +++ b/SCRAPE_PLANAPPS.SH @@ -62,7 +62,7 @@ while [[ $SEARCH_END == "FALSE" ]]; do wget --user-agent="$WGET_UA" $PROJECT_URL -O $PROJECT_PAGE --timestamping -q #--show-progress # Removing COVID is due to the naming in the 2020s. Keeping it for revisiting wayback crawls. - PROJECT_NAME=$(cat $PROJECT_PAGE | grep "field--name-title" | sed 's/.*]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-') + PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-') echo "SCRAPE_PLANAPPS: Found project: $PROJECT_NAME" echo "SCRAPE_PLANAPPS: Finding attachments..."