Compare commits
No commits in common. "3a76f2f5af6dfcb8be0569671b9fe2029191eba4" and "16c4905b4112a4842cded6099d88ba387107cfa4" have entirely different histories.
3a76f2f5af
...
16c4905b41
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
LondonArchive/
|
||||
LondonScrapers_privdata/
|
||||
tmp/
|
||||
staging/
|
||||
39
SCRAPE_AGIS.SH
Normal file → Executable file
39
SCRAPE_AGIS.SH
Normal file → Executable file
@ -7,6 +7,8 @@ echo -e "-=- Lillian Skinner
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
source ./functions/.functions
|
||||
|
||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||
|
||||
ARGIS_URL="https://maps.london.ca/server/rest/services"
|
||||
@ -32,22 +34,33 @@ jq -r '.folders[]?' "$SERVICELIST_JSON" | while read -r FOLDER; do
|
||||
echo "Found $SERVICE"
|
||||
SERVICE_PATH="$FOLDER/$SERVICE"
|
||||
echo "$ARGIS_URL/$SERVICE/MapServer"
|
||||
wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q
|
||||
wget "$ARGIS_URL/$SERVICE/MapServer?f=json" --user-agent="$WGET_UA" -O "$SERVICE_JSON" -q
|
||||
|
||||
rm -r "$TMP_STAGING"
|
||||
mkdir "$TMP_STAGING"
|
||||
mkdir -p "LondonArchive/ArcGIS/${SERVICE}"
|
||||
jq -r '.layers[]? | "\(.id)|\(.name)"' "$SERVICE_JSON" | while IFS='|' read -r LAYERID LAYERNAME; do
|
||||
rm -r "$TMP_STAGING"
|
||||
mkdir "$TMP_STAGING"
|
||||
|
||||
echo "Downloading $LAYERID-$LAYERNAME..."
|
||||
curl -s \
|
||||
"$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query\
|
||||
?where=1=1\
|
||||
&outFields=*\
|
||||
&returnGeometry=true\
|
||||
&f=geojson" \
|
||||
-o "$TMP_STAGING/layer${LAYERID}-${LAYERNAME}.geojson"
|
||||
LAYERNAME_CLEAN=$(echo $LAYERNAME | sed 's/\// /g' | sed 's/\\/ /g' | sed -E 's/ {2,}/ /g')
|
||||
|
||||
curl -s "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&returnCountOnly=true&f=json" -o "$TMP/count.json"
|
||||
ITEM_COUNT=$(jq -r '.count' "$TMP/count.json")
|
||||
MAX_REQUESTS=2000
|
||||
i=0
|
||||
j=0
|
||||
|
||||
while (( i <= ITEM_COUNT )); do
|
||||
echo "Downloading $LAYERID-${j} $LAYERNAME_CLEAN"
|
||||
echo "$i of $ITEM_COUNT"
|
||||
|
||||
_utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=geojson" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.geojson"
|
||||
echo "Done GeoJSON!"
|
||||
_utils_download_helper "$ARGIS_URL/$SERVICE/MapServer/$LAYERID/query?where=1=1&outFields=*&returnGeometry=true&resultOffset=${i}&resultRecordCount=${MAX_REQUESTS}&f=kmz" "$TMP_STAGING/Layer ${LAYERID}-${j} - ${LAYERNAME_CLEAN}.kmz"
|
||||
echo "Done KMZ!"
|
||||
i=$(( i + MAX_REQUESTS ))
|
||||
((j++))
|
||||
done
|
||||
7z a "LondonArchive/ArcGIS/${SERVICE}/Layer ${LAYERID} - ${LAYERNAME_CLEAN}.7z" "$TMP_STAGING"
|
||||
done
|
||||
mkdir -p "LondonArchive/ArcGIS/${FOLDER}/${SERVICE}"
|
||||
7z a "LondonArchive/ArcGIS/${FOLDER}/${SERVICE}/layers.7z" "$TMP_STAGING"
|
||||
done
|
||||
done
|
||||
|
||||
0
SCRAPE_ESCRIBE.SH
Normal file → Executable file
0
SCRAPE_ESCRIBE.SH
Normal file → Executable file
329
SCRAPE_GINV.SH
Executable file
329
SCRAPE_GINV.SH
Executable file
@ -0,0 +1,329 @@
|
||||
#!/usr/bin/env bash
|
||||
echo -e "\n-========================================================================-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- Lillian Skinner -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
source ./functions/.functions
|
||||
|
||||
# Todo:
|
||||
# - Save updates (see bradley-ave)
|
||||
# - Order, title, and collapse each scraped modal
|
||||
|
||||
# London seems to have recently blocked unusual user agents: plain wget (and even ping) no longer gets through. Thankfully we can pretend to be a real person by sending a browser user-agent string!
|
||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||
|
||||
# Scratch files used while scraping. They all live under TEMP_DIR and are
# overwritten or deleted as each project is processed.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_TIMELINE="./tmp/custom_timeline_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

# Today's date; current_year and current_month feed the recency filter in
# the search-page loop below.
current_year=$(date +%Y)
current_month=$(date +%m)
current_day=$(date +%d)

# -p: a plain mkdir fails with "File exists" on every run after the first.
mkdir -p -- "$TEMP_DIR"

# Remove any index left over from a previous run before it is fetched again.
rm -f -- "$SEARCH_PAGE"

SEARCH_URL="https://getinvolved.london.ca/projects"
|
||||
|
||||
#######################################
# Extract project metadata from a project-card tag of the search page.
# Globals:
#   PROJECT_NAME     (written) data-project-name, with curly quotes,
#                    apostrophes and colons removed, whitespace runs
#                    collapsed, then passed through _utils_fix_dashes
#   PROJECT_CATS     (written) data-project-category
#   PROJECT_LOCATION (written) data-project-location
# Arguments:
#   $1 - HTML text of the card tag (the caller joins tags that were split
#        across two lines before calling)
# Notes:
#   _utils_fix_dashes comes from ./functions/.functions; presumably it
#   normalises dash characters - confirm against that file.
#   A missing attribute yields an empty value rather than the whole tag.
#######################################
set_metadata() {
  local card

  # Decode the HTML entities that occur inside attribute values. &quot; is
  # dropped rather than decoded so it cannot terminate the attribute early;
  # square brackets are removed outright.
  card=$(printf '%s\n' "$1" \
    | sed -e 's/&amp;/\&/g' \
          -e 's/&quot;//g' \
          -e "s/&#39;/'/g" \
          -e 's/[][]//g')

  # The name ends up in directory and file names, so strip quote characters
  # and colons and squeeze whitespace runs to a single space.
  PROJECT_NAME=$(_utils_fix_dashes "$(printf '%s\n' "$card" \
    | sed -n 's/.*data-project-name="\([^"]*\)".*/\1/p' \
    | sed -e 's/‘//g' \
          -e 's/’//g' \
          -e "s/'//g" \
          -e 's/://g' \
          -e 's/[[:space:]]\{1,\}/ /g')")

  PROJECT_CATS=$(printf '%s\n' "$card" \
    | sed -n 's/.*data-project-category="\([^"]*\)".*/\1/p')

  PROJECT_LOCATION=$(printf '%s\n' "$card" \
    | sed -n 's/.*data-project-location="\([^"]*\)".*/\1/p')
}
|
||||
|
||||
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
while IFS= read -r LINE; do
|
||||
|
||||
if (( FOUND_DATE )) && [[ "$LAST_LINE" == "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
|
||||
FOUND_DATE=0
|
||||
echo $PROJECT_URL
|
||||
echo $PROJECT_NAME
|
||||
wget --user-agent="$WGET_UA" "$PROJECT_URL" -O $PROJECT_PAGE --timestamping -q #--show-progress
|
||||
# Now we can work on the actual project page.
|
||||
# Start each project from a clean slate: delete the per-project scratch files...
rm -f -- \
  "$CUSTOM_HTML_LINKS" \
  "$CUSTOM_HTML_PHOTOS" \
  "$CUSTOM_HTML_FAQ" \
  "$CUSTOM_HTML_PROFILE" \
  "$CUSTOM_HTML_TIMELINE" \
  "$CUSTOM_HTML_KEYDATES" \
  "$CUSTOM_HTML_SLIDER" \
  "$FULLDUMP"

# ...and clear the page-parser state. A page that ends before a block's
# terminator line is seen would otherwise leave its flag set and the next
# project's first lines would be captured into the wrong section.
NEXT_LINE_CONTENT=0
FIRST_CONTENT=0
IS_DOC_BLOCK=0
IS_PHOTO_BLOCK=0
IS_FAQ_BLOCK=0
IS_PROFILE_BLOCK=0
IS_TIMELINE_BLOCK=0
IS_KEYDATES_BLOCK=0
IS_SLIDER_BLOCK=0
IS_SINGLE_IMAGE_BLOCK=0
|
||||
|
||||
cat ./template/default_getinvolved.html > $CUSTOM_HTML
|
||||
echo "<h1>$PROJECT_NAME</h1>" >> $CUSTOM_HTML
|
||||
while IFS= read -r LINE_PROJ; do
|
||||
if (( NEXT_LINE_CONTENT )); then
|
||||
# Next hive-block marks end of current item
|
||||
if [[ "$LINE_PROJ" == *"hive-block"* ]] || [[ "$LINE_PROJ" == "" ]]; then
|
||||
NEXT_LINE_CONTENT=0
|
||||
echo "End of current content."
|
||||
else
|
||||
# Ignore boring notices
|
||||
if [[ "$LINE_PROJ" != *"</h1>"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"City of London Land Acknowledgement"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"Ongoing Site Specific Planning Applications"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"This site is owned and operated by the City of London using software licensed from Social Pinpoint"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"Social Pinpoint has been commissioned by City of London (Canada) to collect and display user content on their behalf"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"Notice of Collection of Personal Information"* ]] &&
|
||||
[[ "$LINE_PROJ" != *'href="/register"'* ]] &&
|
||||
[[ "$LINE_PROJ" != *'href="/login"'* ]] &&
|
||||
[[ "$LINE_PROJ" != *"Users have the right to access, correct, or delete their personal information"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"This privacy policy may change from time to time"* ]] &&
|
||||
#[[ "$LINE_PROJ" != *"Share your feedback"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"Notice of Collection"* ]] &&
|
||||
#[[ "$LINE_PROJ" != *"Subscribe for project updates"* ]] &&
|
||||
[[ "$LINE_PROJ" != *"Ready to have your say?"* ]]; then
|
||||
# seds to replace youtube iframe with a normal <a href=""> link. wkhtmltopdf obviously can't embed youtube videos.
|
||||
if (( FIRST_CONTENT )); then
|
||||
echo "<!-- LondonArchive_GINV_Body -->" >> $FULLDUMP
|
||||
FIRST_CONTENT=0
|
||||
fi
|
||||
echo $(echo " $LINE_PROJ" | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $CUSTOM_HTML
|
||||
echo $(echo " $LINE_PROJ" | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_DOC_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
|
||||
IS_DOC_BLOCK=0
|
||||
FOUND_LINK=$(cat "$CUSTOM_HTML_LINKS" | grep "/download_file/")
|
||||
if [[ "$FOUND_LINK" == "" ]]; then
|
||||
rm -f $CUSTOM_HTML_LINKS
|
||||
fi
|
||||
echo "End of current documents."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML_LINKS
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_PHOTO_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"<!-- end foreach -->"* ]]; then
|
||||
IS_PHOTO_BLOCK=0
|
||||
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
|
||||
if [[ "$FOUND_LINK" == "" ]]; then
|
||||
rm -f $CUSTOM_HTML_PHOTOS
|
||||
fi
|
||||
echo "End of current photos."
|
||||
else
|
||||
if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $CUSTOM_HTML_PHOTOS
|
||||
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_FAQ_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
|
||||
IS_FAQ_BLOCK=0
|
||||
echo "End of current FAQ."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
|
||||
# I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
|
||||
if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
|
||||
echo $(echo $LINE_PROJ | sed 's/<a role/<h3 role/g' | sed 's/<\/a>/<\/h3>/g') >> $CUSTOM_HTML
|
||||
elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_PROFILE_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"<script>"* ]]; then
|
||||
IS_PROFILE_BLOCK=0
|
||||
echo "End of current profile."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_TIMELINE_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"btn-unfill btn-primary"* ]]; then
|
||||
IS_TIMELINE_BLOCK=0
|
||||
echo "End of current timeline."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_KEYDATES_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"modal-footer"* ]]; then
|
||||
IS_KEYDATES_BLOCK=0
|
||||
echo "End of current key dates."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_SLIDER_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
|
||||
IS_SLIDER_BLOCK=0
|
||||
echo "End of current key dates."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML_SLIDER
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_SINGLE_IMAGE_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == "" ]]; then
|
||||
IS_SINGLE_IMAGE_BLOCK=0
|
||||
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
|
||||
if [[ "$FOUND_LINK" == "" ]]; then
|
||||
rm -f $CUSTOM_HTML_PHOTOS
|
||||
else
|
||||
cat "$CUSTOM_HTML_PHOTOS"
|
||||
fi
|
||||
echo "End of current single image."
|
||||
else
|
||||
if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$LINE_PROJ" == *"hive-block hive-block-content ljs"* ]]; then
|
||||
NEXT_LINE_CONTENT=1
|
||||
FIRST_CONTENT=1
|
||||
# We'll write the LA comment inside of the content block.
|
||||
# There we can ensure that the comment is only written if content does exist.
|
||||
echo "Found content start."
|
||||
elif [[ "$LINE_PROJ" == *"docLibModal hive-block-document-library"* ]]; then
|
||||
IS_DOC_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Documents -->" >> $FULLDUMP
|
||||
echo "Found documents start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
|
||||
IS_PHOTO_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Photos -->" >> $FULLDUMP
|
||||
echo "Found photos start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-modal faqModal hive-block-faq"* ]]; then
|
||||
IS_FAQ_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_FAQ -->" >> $FULLDUMP
|
||||
echo "Found FAQ start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-block-bio hive-block"* ]]; then
|
||||
IS_PROFILE_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Bio -->" >> $FULLDUMP
|
||||
echo "Found profile start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-block-timeline hive-block"* ]]; then
|
||||
IS_TIMELINE_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Timeline -->" >> $FULLDUMP
|
||||
echo "Found timeline start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-modal dateModal"* ]]; then
|
||||
IS_KEYDATES_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Date -->" >> $FULLDUMP
|
||||
echo "Found key dates start."
|
||||
elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
|
||||
IS_SLIDER_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Slider -->" >> $FULLDUMP
|
||||
echo "Found slider start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
|
||||
IS_SINGLE_IMAGE_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_SingleImage -->" >> $FULLDUMP
|
||||
echo "Found single image start."
|
||||
fi
|
||||
done < $PROJECT_PAGE
|
||||
|
||||
#cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
|
||||
#cat "$CUSTOM_HTML_LINKS" >> "$CUSTOM_HTML"
|
||||
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"
|
||||
|
||||
# Download every document collected from the document-library modal.
# -s is true only for an existing, non-empty file, so one test is enough.
if [ -s "$CUSTOM_HTML_LINKS" ]; then
  while IFS= read -r LINE_DOC; do
    if [[ "$LINE_DOC" == *"download_file"* ]]; then
      mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"

      DOC_URL=$(printf '%s\n' "$LINE_DOC" | sed 's/.*href="\([^"]*\)".*/\1/')

      # The download link redirects to the stored file, whose URL carries
      # the real file name. Header lines end in CR and header names are
      # case-insensitive, so strip CRs and match case-insensitively; with
      # several hops only the last Location is the final target. Then drop
      # any query string/fragment, keep the basename, and remove the
      # 32-hex-digit prefix.
      DOC_NAME=$(curl -s -L -I "$DOC_URL" \
        | tr -d '\r' \
        | grep -i '^location:' \
        | tail -n 1 \
        | sed -e 's/^[^:]*:[[:space:]]*//' \
              -e 's/[?#].*$//' \
              -e 's/.*\///' \
              -e 's/^[0-9a-f]\{32\}_//')

      # No redirect seen: fall back to the last path segment of the link so
      # the download is never written to the bare Attachments directory.
      if [[ -z "$DOC_NAME" ]]; then
        DOC_NAME=${DOC_URL%%[?#]*}
        DOC_NAME=${DOC_NAME##*/}
      fi

      echo "$DOC_NAME"
      _utils_download_helper "$DOC_URL" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
    fi
  done < "$CUSTOM_HTML_LINKS"
fi
|
||||
|
||||
if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
|
||||
while IFS= read -r LINE_DOC; do
|
||||
if [[ "$LINE_DOC" == *"amazonaws"* ]]; then
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||
DOC_NAME=$(echo $LINE_DOC | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//')
|
||||
echo $DOC_NAME
|
||||
_utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
|
||||
fi
|
||||
done < $CUSTOM_HTML_PHOTOS
|
||||
fi
|
||||
|
||||
if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||
cat ./template/default_getinvolved.html > $WORK_HTML
|
||||
echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> $WORK_HTML
|
||||
cat "$CUSTOM_HTML_SLIDER" >> $WORK_HTML
|
||||
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $WORK_HTML
|
||||
wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
|
||||
fi
|
||||
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $CUSTOM_HTML
|
||||
wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
|
||||
cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
|
||||
fi
|
||||
|
||||
if (( NEXT_LINE_URL )); then
|
||||
NEXT_LINE_URL=0
|
||||
PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/')
|
||||
fi
|
||||
|
||||
if [[ "$LAST_LINE" != "" ]]; then
|
||||
set_metadata "$LAST_LINE$LINE"
|
||||
LAST_LINE=""
|
||||
NEXT_LINE_URL=1
|
||||
elif [[ "$LINE" == *"h-entry project card"* ]] && [[ "$LINE" == *"data-project-name"* ]] && [[ "$LINE" != *"<%-"* ]]; then
|
||||
#echo $LINE
|
||||
if [[ "$LINE" != *"data-project-category"* ]]; then
|
||||
# Sometimes lines are split, so we'll combine the pieces over time.
|
||||
LAST_LINE=$LINE
|
||||
echo "Line is split!"
|
||||
else
|
||||
LAST_LINE=""
|
||||
set_metadata "$LINE"
|
||||
NEXT_LINE_URL=1
|
||||
fi
|
||||
elif [[ "$LINE" == *'time class="dt-updated"'* ]]; then
|
||||
PROJECT_DATE=$(echo $LINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g')
|
||||
echo $PROJECT_DATE
|
||||
_time_parse_monddyyyy "$PROJECT_DATE"
|
||||
echo "$ITEM_YEAR$ITEM_MONTH$ITEM_DAY"
|
||||
FOUND_DATE=1
|
||||
fi
|
||||
|
||||
done < $SEARCH_PAGE
|
||||
fi
|
||||
301
SCRAPE_GINV_OLD.SH
Executable file
301
SCRAPE_GINV_OLD.SH
Executable file
@ -0,0 +1,301 @@
|
||||
#!/usr/bin/env bash
|
||||
echo -e "\n-========================================================================-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- Lillian Skinner -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
source ./functions/.functions
|
||||
|
||||
# Todo:
|
||||
# - Save updates (see bradley-ave)
|
||||
# - Order, title, and collapse each scraped modal
|
||||
|
||||
# London seems to have recently blocked unusual user agents: plain wget (and even ping) no longer gets through. Thankfully we can pretend to be a real person by sending a browser user-agent string!
|
||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||
|
||||
# Scratch files used while scraping. They all live under TEMP_DIR and are
# overwritten or deleted as each project is processed.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_ginv.html"
PROJECT_PAGE="./tmp/project_ginv.html"
WORK_HTML="./tmp/tmp.html"
CUSTOM_HTML="./tmp/custom_ginv.html"
CUSTOM_HTML_LINKS="./tmp/custom_link_ginv.html"
CUSTOM_HTML_PHOTOS="./tmp/custom_photo_ginv.html"
CUSTOM_HTML_FAQ="./tmp/custom_faq_ginv.html"
CUSTOM_HTML_PROFILE="./tmp/custom_profile_ginv.html"
CUSTOM_HTML_UPDATE="./tmp/custom_update_ginv.html"
CUSTOM_HTML_KEYDATES="./tmp/custom_keydates_ginv.html"
CUSTOM_HTML_SLIDER="./tmp/custom_slider_ginv.html"
FULLDUMP="./tmp/.fulldump.txt"

# -p: a plain mkdir fails with "File exists" on every run after the first.
mkdir -p -- "$TEMP_DIR"

# Remove any index left over from a previous run before it is fetched again.
rm -f -- "$SEARCH_PAGE"

# Site root; "/projects" and the per-project paths are appended to it below.
SEARCH_URL="https://getinvolvedlondon.ca.engagementhq.com"
|
||||
|
||||
wget --user-agent="$WGET_UA" "$SEARCH_URL/projects" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
while IFS= read -r LINE; do
|
||||
|
||||
if [[ "$PROJECT_NAME" != "" ]]; then
|
||||
FOUND_DATE=0
|
||||
echo $PROJECT_URL
|
||||
echo $PROJECT_NAME
|
||||
wget --user-agent="$WGET_UA" "$PROJECT_URL" -O $PROJECT_PAGE --timestamping -q #--show-progress
|
||||
# Now we can work on the actual project page.
|
||||
rm -f $CUSTOM_HTML_LINKS
|
||||
rm -f $CUSTOM_HTML_PHOTOS
|
||||
rm -f $CUSTOM_HTML_FAQ
|
||||
rm -f $CUSTOM_HTML_PROFILE
|
||||
rm -f $CUSTOM_HTML_UPDATE
|
||||
rm -f $CUSTOM_HTML_KEYDATES
|
||||
rm -f $CUSTOM_HTML_SLIDER
|
||||
rm -f $FULLDUMP
|
||||
|
||||
cat ./template/default_getinvolved.html > $CUSTOM_HTML
|
||||
echo "<h1>$PROJECT_NAME</h1>" >> $CUSTOM_HTML
|
||||
while IFS= read -r LINE_PROJ; do
|
||||
|
||||
if (( IS_DOC_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == "</ul>" ]]; then
|
||||
IS_DOC_BLOCK=0
|
||||
FOUND_LINK=$(cat "$CUSTOM_HTML_LINKS" | grep "/documents/")
|
||||
if [[ "$FOUND_LINK" == "" ]]; then
|
||||
rm -f $CUSTOM_HTML_LINKS
|
||||
fi
|
||||
echo "End of current documents."
|
||||
elif [[ "$LINE_PROJ" == *"a data-url"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML_LINKS
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
echo $LINE_PROJ
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_PHOTO_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == "" ]]; then
|
||||
IS_PHOTO_BLOCK=0
|
||||
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
|
||||
if [[ "$FOUND_LINK" == "" ]]; then
|
||||
rm -f $CUSTOM_HTML_PHOTOS
|
||||
fi
|
||||
echo "End of current photos."
|
||||
else
|
||||
if [[ "$LINE_PROJ" == *'aria-label="'* ]] && [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $CUSTOM_HTML_PHOTOS
|
||||
echo $(echo $LINE_PROJ | sed 's/.*href="\([^"]*\)".*/\1/') >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_FAQ_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"div class='clearfix'></div"* ]]; then
|
||||
IS_FAQ_BLOCK=0
|
||||
echo "End of current FAQ."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
|
||||
# I don't care that this is invalid HTML. All you'll see in the end is a nicely formatted PDF.
|
||||
if [[ "$LINE_PROJ" == *"hive-block-faq mod-reverse"* ]]; then
|
||||
echo $(echo $LINE_PROJ | sed 's/<a role/<h3 role/g' | sed 's/<\/a>/<\/h3>/g') >> $CUSTOM_HTML
|
||||
elif [[ "$LINE_PROJ" != *"</h1>"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_PROFILE_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"<!--[if IE]>"* ]]; then
|
||||
IS_PROFILE_BLOCK=0
|
||||
echo "End of current profile."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-close btn-inverse close"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_UPDATE_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"<div class='clearfix'></div>"* ]]; then
|
||||
IS_UPDATE_BLOCK=0
|
||||
echo "End of current update."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn-unfill btn-primary"* ]] && [[ "$LINE_PROJ" != *'class="sr-only"'* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_KEYDATES_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == "" ]]; then
|
||||
IS_KEYDATES_BLOCK=0
|
||||
echo "End of current key dates."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"btn-close btn-inverse close"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_SLIDER_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == *"<!-- Controls -->"* ]]; then
|
||||
IS_SLIDER_BLOCK=0
|
||||
echo "End of current key dates."
|
||||
else
|
||||
if [[ "$LINE_PROJ" != *"btn btn-default"* ]] && [[ "$LINE_PROJ" != *"</h3"* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML_SLIDER
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if (( IS_SINGLE_IMAGE_BLOCK )); then
|
||||
if [[ "$LINE_PROJ" == "" ]]; then
|
||||
IS_SINGLE_IMAGE_BLOCK=0
|
||||
FOUND_LINK=$(cat "$CUSTOM_HTML_PHOTOS" | grep "amazonaws")
|
||||
if [[ "$FOUND_LINK" == "" ]]; then
|
||||
rm -f $CUSTOM_HTML_PHOTOS
|
||||
else
|
||||
cat "$CUSTOM_HTML_PHOTOS"
|
||||
fi
|
||||
echo "End of current single image."
|
||||
else
|
||||
if [[ "$LINE_PROJ" == *'class="hive-image"'* ]]; then
|
||||
echo $LINE_PROJ >> $CUSTOM_HTML
|
||||
echo $LINE_PROJ >> $FULLDUMP
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$LINE_PROJ" == *'div class="full-description hide"'* ]]; then
|
||||
FIRST_CONTENT=1
|
||||
# We'll write the LA comment inside of the content block.
|
||||
# There we can ensure that the comment is only written if content does exist.
|
||||
echo "Found content start."
|
||||
|
||||
if (( FIRST_CONTENT )); then
|
||||
echo "<!-- LondonArchive_GINV_Body -->" >> $FULLDUMP
|
||||
FIRST_CONTENT=0
|
||||
fi
|
||||
echo $(echo " $LINE_PROJ" | sed 's/.*<div class="full-description hide">/<div>/' | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $CUSTOM_HTML
|
||||
echo $(echo " $LINE_PROJ" | sed 's/.*<div class="full-description hide">/<div>/' | sed 's/src="https:\/\/www\.youtube\.com\/embed/href="https:\/\/www\.youtube\.com\/watch/' | sed 's/<iframe/<a/' | sed 's/<\/iframe>/YouTube Link<\/a><\/br>/') >> $FULLDUMP
|
||||
|
||||
elif [[ "$LINE_PROJ" == *"widget-wrap widget_document_library"* ]]; then
|
||||
IS_DOC_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Documents -->" >> $FULLDUMP
|
||||
echo "Found documents start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-block-media hive-block"* ]]; then
|
||||
IS_PHOTO_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Photos -->" >> $FULLDUMP
|
||||
echo "Found photos start."
|
||||
elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_recent_photos'"* ]]; then
|
||||
IS_FAQ_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_FAQ -->" >> $FULLDUMP
|
||||
echo "Found FAQ start."
|
||||
elif [[ "$LINE_PROJ" == *"widget-wrap widget_project_team"* ]]; then
|
||||
IS_PROFILE_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Bio -->" >> $FULLDUMP
|
||||
echo "Found profile start."
|
||||
elif [[ "$LINE_PROJ" == *"<div class='fr-view'>"* ]]; then
|
||||
IS_UPDATE_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Update -->" >> $FULLDUMP
|
||||
echo "<h1>Project Updates</h1>" >> $CUSTOM_HTML_UPDATE
|
||||
echo "Found update start."
|
||||
elif [[ "$LINE_PROJ" == *"div class='widget-wrap widget_life_cycle'"* ]]; then
|
||||
IS_KEYDATES_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Date -->" >> $FULLDUMP
|
||||
echo "Found key dates start."
|
||||
elif [[ "$LINE_PROJ" == *"<!-- Wrapper for slider -->"* ]]; then
|
||||
IS_SLIDER_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_Slider -->" >> $FULLDUMP
|
||||
echo "Found slider start."
|
||||
elif [[ "$LINE_PROJ" == *"hive-block hive-block-image"* ]]; then
|
||||
IS_SINGLE_IMAGE_BLOCK=1
|
||||
echo "<!-- LondonArchive_GINV_SingleImage -->" >> $FULLDUMP
|
||||
echo "Found single image start."
|
||||
fi
|
||||
done < $PROJECT_PAGE
|
||||
|
||||
#cat "$CUSTOM_HTML_FAQ" >> "$CUSTOM_HTML"
|
||||
cat "$CUSTOM_HTML_LINKS" # >> "$CUSTOM_HTML"
|
||||
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/"
|
||||
|
||||
if [ -e "$CUSTOM_HTML_LINKS" ] && [ -s "$CUSTOM_HTML_LINKS" ]; then
|
||||
while IFS= read -r LINE_DOC; do
|
||||
if [[ "$LINE_DOC" == *"/documents/"* ]]; then
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||
DOC_NAME="$(echo $LINE_DOC | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/g' | sed 's/ (pdf)//' | sed 's/^ +| +$//g').pdf"
|
||||
echo "-------- "$DOC_NAME
|
||||
_utils_download_helper "$(echo $LINE_DOC | sed 's/.*href="\([^"]*\)".*/\1/')/download" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
|
||||
fi
|
||||
done < $CUSTOM_HTML_LINKS
|
||||
fi
|
||||
|
||||
if [ -e "$CUSTOM_HTML_PHOTOS" ] && [ -s "$CUSTOM_HTML_PHOTOS" ]; then
|
||||
while IFS= read -r LINE_DOC; do
|
||||
if [[ "$LINE_DOC" == *"ehq-production"* ]]; then
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||
DOC_NAME=$(echo $LINE_DOC | sed 's/.*\///' | sed 's/^[0-9a-f]\{32\}_//')
|
||||
echo "======== "$DOC_NAME
|
||||
_utils_download_helper "$LINE_DOC" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/$DOC_NAME"
|
||||
fi
|
||||
done < $CUSTOM_HTML_PHOTOS
|
||||
fi
|
||||
|
||||
if [ -e "$CUSTOM_HTML_SLIDER" ] && [ -s "$CUSTOM_HTML_SLIDER" ]; then
|
||||
mkdir -p "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments"
|
||||
cat ./template/default_getinvolved.html > $WORK_HTML
|
||||
echo "<h1>$PROJECT_NAME Photo Gallery</h1>" >> $WORK_HTML
|
||||
cat "$CUSTOM_HTML_SLIDER" >> $WORK_HTML
|
||||
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $WORK_HTML
|
||||
wkhtmltopdf --image-quality 100 "$WORK_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Attachments/${PROJECT_NAME}_GALLERY.pdf"
|
||||
fi
|
||||
echo "<br><br><small><i>Automatically generated for the London Archive on $(date)</i></small>" >> $CUSTOM_HTML
|
||||
wkhtmltopdf --image-quality 100 "$CUSTOM_HTML" "./LondonArchive/GetInvolved/$PROJECT_NAME/Main.pdf"
|
||||
cp "$FULLDUMP" "./LondonArchive/GetInvolved/$PROJECT_NAME/.backup.txt"
|
||||
|
||||
IS_DOC_BLOCK=0
|
||||
IS_PHOTO_BLOCK=0
|
||||
IS_FAQ_BLOCK=0
|
||||
IS_PROFILE_BLOCK=0
|
||||
IS_UPDATE_BLOCK=0
|
||||
IS_KEYDATES_BLOCK=0
|
||||
IS_SLIDER_BLOCK=0
|
||||
IS_SINGLE_IMAGE_BLOCK=0
|
||||
|
||||
PROJECT_NAME=""
|
||||
fi
|
||||
|
||||
if (( NEXT_LINE_CONT_NAME )); then
|
||||
PROJECT_NAME=$(_utils_fix_dashes "$(echo $LAST_LINE$LINE | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' | sed 's/‘//g' | sed 's/\// and /g' | sed 's/\\/ and /g' | sed 's/’//g' | sed 's/'\''//g' | sed 's/://g' | sed 's/®//g' | sed 's/"//g' | sed 's/&/and/g' | sed 's/amp;//g' | sed 's/^ +| +$//g')")
|
||||
NEXT_LINE_CONT_NAME=0
|
||||
echo $PROJECT_NAME
|
||||
fi
|
||||
|
||||
if [[ "$LINE" == *"project-tile__meta__name"* ]]; then
|
||||
if [[ "$LINE" != *"</span"* ]]; then
|
||||
NEXT_LINE_CONT_NAME=1
|
||||
LAST_LINE=$LINE
|
||||
else
|
||||
PROJECT_NAME=$(_utils_fix_dashes "$(echo $LINE | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/g' | sed 's/‘//g' | sed 's/\// and /g' | sed 's/\\/ and /g' | sed 's/’//g' | sed 's/'\''//g' | sed 's/://g' | sed 's/®//g' | sed 's/"//g' | sed 's/&/and/g' | sed 's/'//g' | sed 's/amp;//g' | sed 's/^ +| +$//g')")
|
||||
echo $PROJECT_NAME
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$LINE" == *"project-tile__link"* ]]; then
|
||||
PROJECT_URL=$(echo $LINE | sed 's/.*href="\([^"]*\)".*/\1/')
|
||||
PROJECT_URL=$(echo $SEARCH_URL$PROJECT_URL)
|
||||
echo " "$PROJECT_URL
|
||||
# Reset project name to mark the start of a new project
|
||||
PROJECT_NAME=""
|
||||
fi
|
||||
|
||||
done < $SEARCH_PAGE
|
||||
fi
|
||||
39
SCRAPE_LPS.SH
Normal file → Executable file
39
SCRAPE_LPS.SH
Normal file → Executable file
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
#!/bin/bash
|
||||
echo -e "\n-========================================================================-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- SCRAPE_LPS.SH: Downloads LPS committee agendas and minutes -=-"
|
||||
@ -8,28 +8,7 @@ echo -e "-=- Lillian Skinner
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
#######################################
# Parse a date written as "Month D, YYYY" (e.g. "May 8, 2024").
# Globals:
#   MEETING_MONTH_WORD (written) leading alphabetic word of $1
#   MEETING_DAY_SHORT  (written) day digits exactly as they appear
#   MEETING_DAY        (written) two-digit day, "00" if not parseable
#   MEETING_MONTH      (written) two-digit month, "--" if not recognised
#   MEETING_YEAR       (written) year digits, "0000" if not parseable
# Arguments:
#   $1 - date text
# Outputs:
#   Echoes $1 to stdout.
#######################################
conv_date() {
  local -r month_re='^([A-Za-z]+) '
  local -r day_year_re='^[A-Za-z]+ ([0-9]+), ([0-9]+)'

  echo "$1"

  MEETING_MONTH_WORD=""
  if [[ "$1" =~ $month_re ]]; then
    MEETING_MONTH_WORD=${BASH_REMATCH[1]}
  fi

  if [[ "$1" =~ $day_year_re ]]; then
    MEETING_DAY_SHORT=${BASH_REMATCH[1]}
    MEETING_YEAR=${BASH_REMATCH[2]}
    # 10# forces base 10: a zero-padded "08" or "09" would otherwise be
    # rejected as an invalid octal number and come out as "00".
    printf -v MEETING_DAY '%02d' "$((10#${MEETING_DAY_SHORT}))"
  else
    # Keep the fields numeric so later 10#$MEETING_... arithmetic still works.
    MEETING_DAY_SHORT=""
    MEETING_DAY="00"
    MEETING_YEAR="0000"
  fi

  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;;
  esac
}
|
||||
source ./functions/.functions
|
||||
|
||||
MEETINGS_PAGE="./tmp.html"
|
||||
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully pretend to be a real person!
|
||||
@ -45,9 +24,9 @@ current_year=$(date +%Y)
|
||||
current_month=$(date +%m)
|
||||
current_day=$(date +%d)
|
||||
# If I don't set these values then "10#: invalid integer constant"
|
||||
MEETING_YEAR="0000"
|
||||
MEETING_MONTH="00"
|
||||
MEETING_DAY="00"
|
||||
ITEM_YEAR="0000"
|
||||
ITEM_MONTH="00"
|
||||
ITEM_DAY="00"
|
||||
|
||||
while IFS= read -r LINE_PRE; do
|
||||
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
|
||||
@ -66,11 +45,11 @@ while IFS= read -r LINE_PRE; do
|
||||
|
||||
FOUND_LINK=$(echo $LINE | grep 'a href="' | grep ".pdf" | grep '<td valign="top">')
|
||||
if [[ "$ATTACH_TYPE" != "" ]] && [[ "$FOUND_LINK" != "" ]]; then
|
||||
conv_date "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
|
||||
echo "$MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
_time_parse_helper "$(echo $FOUND_LINK | sed 's/.*<a[^>]*>\([^<]*\)<[\/:-]a>.*/\1/' | sed -e 's/\([0-9]\{4\}\).*/\1/' | sed -e 's/^[[:space:]]*//g; s/[[:space:]]*$//g')"
|
||||
echo "$ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||
echo "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')"
|
||||
mkdir -p "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/"
|
||||
wget "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" -O "./LondonArchive/LPS/Board/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/$ATTACH_TYPE.pdf" -q
|
||||
mkdir -p "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/"
|
||||
_utils_download_helper "$(echo $FOUND_LINK | sed 's/.*href="\([^"]*\)".*/\1/')" "./LondonArchive/LPS/Board/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/$ATTACH_TYPE.pdf"
|
||||
fi
|
||||
|
||||
done < "./tmp/index.html"
|
||||
|
||||
55
SCRAPE_LTC.SH
Normal file → Executable file
55
SCRAPE_LTC.SH
Normal file → Executable file
@ -7,6 +7,8 @@ echo -e "-=- Lillian Skinner
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
source ./functions/.functions
|
||||
|
||||
MEETINGS_PAGE="./tmp.html"
|
||||
# London seems to have recently blocked unusual user agents. Can't use wget or even ping. Thankfully, we can pretend to be a real person!
|
||||
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
|
||||
@ -21,9 +23,9 @@ current_year=$(date +%Y)
|
||||
current_month=$(date +%m)
|
||||
current_day=$(date +%d)
|
||||
# If I don't set these values then "10#: invalid integer constant"
|
||||
MEETING_YEAR="0000"
|
||||
MEETING_MONTH="00"
|
||||
MEETING_DAY="00"
|
||||
ITEM_YEAR="0000"
|
||||
ITEM_MONTH="00"
|
||||
ITEM_DAY="00"
|
||||
|
||||
while IFS= read -r LINE_PRE; do
|
||||
LINE=$(echo $LINE_PRE | sed 's/\xC2\xA0/ /')
|
||||
@ -47,33 +49,16 @@ while IFS= read -r LINE_PRE; do
|
||||
elif [[ "$GREPDATE" != "" ]]; then
|
||||
# Remove HTML junk from date string.
|
||||
DATES_CLEAN=$(echo $GREPDATE | sed 's/.*<strong>//' | sed 's/<\/strong>.*//' | sed 's/<span.*//' | sed -e 's/[[:space:]]*$//' | sed 's/\.//')
|
||||
MEETING_MONTH_WORD=$(echo "$DATES_CLEAN" | sed -E 's/^([A-Za-z]+) .*/\1/')
|
||||
MEETING_DAY_SHORT=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/')
|
||||
MEETING_DAY=$(printf "%02d" ${MEETING_DAY_SHORT#0})
|
||||
MEETING_YEAR=$(echo "$DATES_CLEAN" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/')
|
||||
|
||||
case "$MEETING_MONTH_WORD" in
|
||||
Jan*) MEETING_MONTH="01" ;;
|
||||
Feb*) MEETING_MONTH="02" ;;
|
||||
Mar*) MEETING_MONTH="03" ;;
|
||||
Apr*) MEETING_MONTH="04" ;;
|
||||
May) MEETING_MONTH="05" ;;
|
||||
Jun*) MEETING_MONTH="06" ;;
|
||||
Jul*) MEETING_MONTH="07" ;;
|
||||
Aug*) MEETING_MONTH="08" ;;
|
||||
Sep*) MEETING_MONTH="09" ;;
|
||||
Oct*) MEETING_MONTH="10" ;;
|
||||
Nov*) MEETING_MONTH="11" ;;
|
||||
Dec*) MEETING_MONTH="12" ;;
|
||||
*) MEETING_MONTH="--" ;;
|
||||
esac
|
||||
_time_parse_helper "$DATES_CLEAN"
|
||||
|
||||
echo " NEW MEETING FOUND"
|
||||
echo " DATE IS $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
echo " DATE IS $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||
GREPDATE=""
|
||||
else
|
||||
# Has a previous meeting been set? What about a date?
|
||||
# Remove comparison to current dates in order to download full page. Adding this for automated LA scripts.
|
||||
if [[ "COMMITTEENAME" != "" ]] && [[ "MEETING_YEAR" != "" ]] && (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
|
||||
if [[ "COMMITTEENAME" != "" ]] && [[ "ITEM_YEAR" != "" ]] && (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
|
||||
# Not changing meetings, and we know that an old meeting has already been set. Keep going.
|
||||
|
||||
# If match --> make folder --> download
|
||||
@ -85,25 +70,25 @@ while IFS= read -r LINE_PRE; do
|
||||
# Well... this aged well.
|
||||
if [[ "$AGENDAURL" != "" || "$MINUTESURL" != "" || "$AGENDAHTMLURL" != "" || "$MINUTESHTMLURL" != "" ]]; then
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG" 2> /dev/null
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR" 2> /dev/null
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY" 2> /dev/null
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR" 2> /dev/null
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY" 2> /dev/null
|
||||
if [[ "$AGENDAURL" != "" ]]; then
|
||||
echo " DOWNLOAD AGENDA PDF"
|
||||
echo " $AGENDAURL"
|
||||
wget --user-agent="$WGET_UA" "$AGENDAURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Agenda.pdf" -c -q #--show-progress
|
||||
_utils_download_helper "$AGENDAURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Agenda.pdf"
|
||||
elif [[ "$MINUTESURL" != "" ]]; then
|
||||
echo " DOWNLOAD MINUTES PDF"
|
||||
echo " $MINUTESURL"
|
||||
wget --user-agent="$WGET_UA" "$MINUTESURL" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Minutes.pdf" -c -q #--show-progress
|
||||
_utils_download_helper "$MINUTESURL" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Minutes.pdf"
|
||||
elif [[ "$AGENDAHTMLURL" != "" ]] || [[ "$MINUTESHTMLURL" != "" ]]; then
|
||||
if [[ "$AGENDAHTMLURL" != "" ]]; then
|
||||
echo " DOWNLOAD AGENDA HTML TO CRAWL"
|
||||
echo " $AGENDAHTMLURL"
|
||||
wget --user-agent="$WGET_UA" "$AGENDAHTMLURL" -O "./tmp/work.html" -q #--show-progress
|
||||
_utils_download_helper "$AGENDAHTMLURL" "./tmp/work.html"
|
||||
elif [[ "$MINUTESHTMLURL" != "" ]]; then
|
||||
echo " DOWNLOAD MINUTES HTML TO CRAWL"
|
||||
echo " $MINUTESHTMLURL"
|
||||
wget --user-agent="$WGET_UA" "$MINUTESHTMLURL" -O "./tmp/work.html" -q #--show-progress
|
||||
_utils_download_helper "$MINUTESHTMLURL" "./tmp/work.html"
|
||||
fi
|
||||
while IFS= read -r LINE_HTML_PRE; do
|
||||
LINE_HTML=$(echo $LINE_HTML_PRE | sed 's/\xC2\xA0/ /')
|
||||
@ -117,20 +102,20 @@ while IFS= read -r LINE_PRE; do
|
||||
echo " END OF INDEX ARTICLE"
|
||||
ISARTICLE=0
|
||||
elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
|
||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments" 2> /dev/null
|
||||
ISPDF=$(echo $GREPLINK | grep "\.pdf")
|
||||
if [[ "$ISPDF" != "" ]]; then
|
||||
PDFNAME=$(echo $ISPDF | sed 's/.*\///')
|
||||
echo " DOWNLOAD ATTACHMENT PDF"
|
||||
echo " $ISPDF"
|
||||
wget --user-agent="$WGET_UA" "$ISPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFNAME" -c -q #--show-progress
|
||||
_utils_download_helper "$ISPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFNAME"
|
||||
else
|
||||
# Extract title of attachment
|
||||
ATTACHTITLE=$(echo $LINE_HTML | sed 's/<sup>//g' | sed 's/<\/sup>//g' | sed -n 's/.*<a href=".*">\([^<]*\)<\/a>.*/\1/p' | sed 's/&/and/g' | sed 's/&.....;./ /g' | perl -CS -pe 's/[\x{2013}\x{2014}\x{2012}\x{2015}\x{2212}]//g' | sed 's/ / /g' | sed 's/ / /g')
|
||||
echo " DOWNLOAD ATTACHMENT HTML"
|
||||
echo " $ATTACHTITLE"
|
||||
echo " $GREPLINK"
|
||||
wget --user-agent="$WGET_UA" "$GREPLINK" -O "./tmp/attachment.html" -q #--show-progress
|
||||
_utils_download_helper "$GREPLINK" "./tmp/attachment.html"
|
||||
while IFS= read -r LINE_ATTACH_PRE; do
|
||||
LINE_ATTACH=$(echo $LINE_ATTACH_PRE | sed 's/\xC2\xA0/ /')
|
||||
GREPATTACHMENTARTICLESTART=$(echo $LINE_ATTACH | grep "<article")
|
||||
@ -146,7 +131,7 @@ while IFS= read -r LINE_PRE; do
|
||||
echo " END OF ATTACHMENT ARTICLE"
|
||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||
echo " PROCESSED TO PDF"
|
||||
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
||||
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
||||
ISATTACHMENTARTICLE=0
|
||||
elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
|
||||
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
|
||||
@ -154,7 +139,7 @@ while IFS= read -r LINE_PRE; do
|
||||
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
|
||||
echo " DOWNLOAD REFERENCED ATTACHMENT PDF"
|
||||
echo " $GREPATTACHMENTLINK"
|
||||
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
|
||||
_utils_download_helper "$ISREFPDF" "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$ITEM_YEAR/$ITEM_MONTH-$ITEM_DAY/Attachments/$PDFREFNAME"
|
||||
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
|
||||
fi
|
||||
elif (( ISATTACHMENTARTICLE )); then
|
||||
|
||||
577
SCRAPE_MEET.SH
Normal file → Executable file
577
SCRAPE_MEET.SH
Normal file → Executable file
@ -1,211 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
echo -e "\n-========================================================================-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
|
||||
echo -e "-=- SCRAPE_LONDON.SH: Downloads committee videos and agendas -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-=- Lillian Skinner -=-"
|
||||
echo -e "-=- Lillian Skinner -=-"
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
#######################################
# Split a "Month D, YYYY" date string into zero-padded components.
# Globals:   MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#            MEETING_YEAR, MEETING_MONTH (all written)
# Arguments: $1 - date text, e.g. "Feb 25, 2026"
# Outputs:   echoes $1 unchanged to stdout
#######################################
conv_date() {
  echo "$1"

  # Each sed pair extracts one field, then trims surrounding blanks/tabs.
  MEETING_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  MEETING_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  MEETING_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')

  if [[ "$MEETING_DAY_SHORT" =~ ^[0-9]+$ ]]; then
    # Force base 10: a bare "08" or "09" is rejected by printf as invalid
    # octal and would be formatted as "00".
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  else
    # The day could not be extracted; fall back to "00" instead of letting
    # printf fail on non-numeric text.
    MEETING_DAY="00"
  fi

  # Month names are matched by prefix so both "Feb" and "February" work.
  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;; # unrecognised month word
  esac
}
|
||||
|
||||
#######################################
# Split a day-first "D Month YYYY" date string into zero-padded components.
# Globals:   MEETING_MONTH_WORD, MEETING_DAY_SHORT, MEETING_DAY,
#            MEETING_YEAR, MEETING_MONTH (all written)
# Arguments: $1 - date text, e.g. "25 Feb 2026"
# Outputs:   echoes $1 unchanged to stdout
#######################################
conv_date_alt() {
  echo "$1"

  # Second space-separated token: drop the first token, then everything
  # after the next space.
  MEETING_MONTH_WORD=$(echo "$1" | sed 's/^[^ ]* //' | sed 's/ .*//')
  # First token.
  MEETING_DAY_SHORT=$(echo "$1" | sed 's/ .*//')
  # Last token.
  MEETING_YEAR=$(echo "$1" | sed 's/.* //')

  if [[ "$MEETING_DAY_SHORT" =~ ^[0-9]+$ ]]; then
    # Force base 10: a bare "08" or "09" is rejected by printf as invalid
    # octal and would be formatted as "00".
    printf -v MEETING_DAY '%02d' "$((10#$MEETING_DAY_SHORT))"
  else
    # The day could not be extracted; fall back to "00" instead of letting
    # printf fail on non-numeric text.
    MEETING_DAY="00"
  fi

  # Month names are matched by prefix so both "Feb" and "February" work.
  case "$MEETING_MONTH_WORD" in
    Jan*) MEETING_MONTH="01" ;;
    Feb*) MEETING_MONTH="02" ;;
    Mar*) MEETING_MONTH="03" ;;
    Apr*) MEETING_MONTH="04" ;;
    May) MEETING_MONTH="05" ;;
    Jun*) MEETING_MONTH="06" ;;
    Jul*) MEETING_MONTH="07" ;;
    Aug*) MEETING_MONTH="08" ;;
    Sep*) MEETING_MONTH="09" ;;
    Oct*) MEETING_MONTH="10" ;;
    Nov*) MEETING_MONTH="11" ;;
    Dec*) MEETING_MONTH="12" ;;
    *) MEETING_MONTH="--" ;; # unrecognised month word
  esac
}
|
||||
|
||||
#######################################
# Store a meeting-link URL in the global that matches its link title.
# Titles are compared WITH their surrounding double quotes; an unknown
# title leaves every global untouched.
# Globals:   one of the *_HTML_URL / *_PDF_URL variables (written)
# Arguments: $1 - link title, still wrapped in double quotes
#            $2 - link URL; every double-quote character is removed
#######################################
set_agenda_url() {
  local title="$1"
  local url
  # Strip quotes with parameter expansion rather than `echo $2 | sed`, so
  # the URL is never word-split or glob-expanded ('?' and '*' are common
  # in query strings).
  url=${2//\"/}

  case "$title" in
    '"Agenda (HTML)"') AGENDA_HTML_URL="$url" ;;
    '"Agenda (PDF)"') AGENDA_PDF_URL="$url" ;;
    '"Revised Agenda (HTML)"') AGENDA_REVISE_HTML_URL="$url" ;;
    '"Revised Agenda (PDF)"') AGENDA_REVISE_PDF_URL="$url" ;;
    '"Minutes (HTML)"') MINUTES_HTML_URL="$url" ;;
    '"Minutes (PDF)"') MINUTES_PDF_URL="$url" ;;
    '"Minutes with Attachments (PDF)"') MINUTES_ATTACH_PDF_URL="$url" ;;

    '"Agenda Full Package (HTML)"') AGENDA_FULL_HTML_URL="$url" ;;
    '"Agenda Full Package (PDF)"') AGENDA_FULL_PDF_URL="$url" ;;
    '"Agenda Cover Page (HTML)"') AGENDA_COVER_HTML_URL="$url" ;;
    '"Agenda Cover Page (PDF)"') AGENDA_COVER_PDF_URL="$url" ;;
    '"Post Agenda (HTML)"') AGENDA_POST_HTML_URL="$url" ;;
    '"Post Agenda (PDF)"') AGENDA_POST_PDF_URL="$url" ;;
    '"Addendum (HTML)"') ADDENDUM_HTML_URL="$url" ;;
    '"Addendum (PDF)"') ADDENDUM_PDF_URL="$url" ;;
  esac
}
|
||||
|
||||
#######################################
# Reset every per-meeting document URL to the empty string so links
# collected for one meeting cannot carry over into the next.
# Globals:   all fifteen *_HTML_URL / *_PDF_URL variables (written)
# Arguments: none
#######################################
clear_agenda_url() {
  # Agendas, revised agendas and minutes.
  AGENDA_HTML_URL=""
  AGENDA_PDF_URL=""
  AGENDA_REVISE_HTML_URL=""
  AGENDA_REVISE_PDF_URL=""
  MINUTES_HTML_URL=""
  MINUTES_PDF_URL=""
  MINUTES_ATTACH_PDF_URL=""

  # Full-package, cover-page, post-meeting and addendum variants.
  AGENDA_FULL_HTML_URL=""
  AGENDA_FULL_PDF_URL=""
  AGENDA_COVER_HTML_URL=""
  AGENDA_COVER_PDF_URL=""
  AGENDA_POST_HTML_URL=""
  AGENDA_POST_PDF_URL=""
  ADDENDUM_HTML_URL=""
  ADDENDUM_PDF_URL=""
}
|
||||
|
||||
#######################################
# Fetch one URL to a local file with curl and report the outcome.
# Globals:   WGET_UA (read) - sent as the User-Agent header
# Arguments: $1 - URL to fetch
#            $2 - destination file path
# Outputs:   "Downloaded." or "Already exists! Skipping." on stdout;
#            a "FAILED! ..." line on stderr otherwise
# Returns:   0 for HTTP 200 or 304, 1 for any other status code
#######################################
download_helper() {
  local url="$1"
  local out="$2"
  local code

  # -L follows redirects; -z "$out" makes the request conditional on the
  # existing file's timestamp, so an unchanged document answers 304;
  # -w prints only the final HTTP status, which is all that is captured.
  # NOTE(review): -k disables TLS certificate verification - confirm this
  # is still required for the target host.
  # NOTE(review): curl writes the response body to "$out" even on error
  # statuses, so a failed fetch may leave an error page at that path.
  code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 -z "$out" -o "$out" "$url")

  case "$code" in
    200)
      echo "Downloaded."
      ;;
    304)
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      return 1
      ;;
  esac
}
|
||||
|
||||
#######################################
# Download every document collected for the current meeting.
# PDFs are preferred; an HTML copy is saved only when the matching PDF
# link is absent (for minutes: when neither PDF variant is present).
# Every download is attempted even if an earlier one fails.
# Globals:   the *_HTML_URL / *_PDF_URL variables (read)
# Arguments: $1 - existing directory that receives the files
# Outputs:   one progress line per document, plus download_helper output
# Returns:   0 if every attempted download succeeded, 1 otherwise
#######################################
download_agendas() {
  local outdir="$1"
  local status=0

  if [[ -n "$AGENDA_REVISE_PDF_URL" ]]; then
    echo "Saving revised agenda as PDF..."
    download_helper "$AGENDA_REVISE_PDF_URL" "$outdir/Agenda_Revised.pdf" || status=1
  fi

  if [[ -n "$AGENDA_PDF_URL" ]]; then
    echo "Saving regular agenda as PDF..."
    download_helper "$AGENDA_PDF_URL" "$outdir/Agenda.pdf" || status=1
  fi

  if [[ -z "$AGENDA_REVISE_PDF_URL" && -n "$AGENDA_REVISE_HTML_URL" ]]; then
    echo "Saving revised agenda as HTML... (no PDF found!)"
    download_helper "$AGENDA_REVISE_HTML_URL" "$outdir/Agenda_Revised.html" || status=1
  fi

  if [[ -z "$AGENDA_PDF_URL" && -n "$AGENDA_HTML_URL" ]]; then
    echo "Saving regular agenda as HTML... (no PDF found!)"
    download_helper "$AGENDA_HTML_URL" "$outdir/Agenda.html" || status=1
  fi

  # The PDF branches below no longer claim "(no HTML found!)": the HTML
  # link is never inspected before a PDF is downloaded.
  if [[ -n "$AGENDA_FULL_PDF_URL" ]]; then
    echo "Saving full package agenda as PDF..."
    download_helper "$AGENDA_FULL_PDF_URL" "$outdir/Agenda_FullPackage.pdf" || status=1
  fi

  if [[ -z "$AGENDA_FULL_PDF_URL" && -n "$AGENDA_FULL_HTML_URL" ]]; then
    echo "Saving full package agenda as HTML... (no PDF found!)"
    download_helper "$AGENDA_FULL_HTML_URL" "$outdir/Agenda_FullPackage.html" || status=1
  fi

  if [[ -n "$AGENDA_POST_PDF_URL" ]]; then
    echo "Saving post agenda as PDF..."
    download_helper "$AGENDA_POST_PDF_URL" "$outdir/Agenda_Post.pdf" || status=1
  fi

  if [[ -z "$AGENDA_POST_PDF_URL" && -n "$AGENDA_POST_HTML_URL" ]]; then
    echo "Saving post agenda as HTML... (no PDF found!)"
    download_helper "$AGENDA_POST_HTML_URL" "$outdir/Agenda_Post.html" || status=1
  fi

  if [[ -n "$MINUTES_ATTACH_PDF_URL" ]]; then
    echo "Saving minutes with attachments as PDF..."
    download_helper "$MINUTES_ATTACH_PDF_URL" "$outdir/Minutes_With_Attachments.pdf" || status=1
  fi

  if [[ -n "$MINUTES_PDF_URL" ]]; then
    echo "Saving minutes as PDF..."
    download_helper "$MINUTES_PDF_URL" "$outdir/Minutes.pdf" || status=1
  fi

  # HTML minutes are a last resort: only when neither PDF variant exists.
  if [[ -z "$MINUTES_ATTACH_PDF_URL" && -z "$MINUTES_PDF_URL" && -n "$MINUTES_HTML_URL" ]]; then
    echo "Saving minutes as HTML... (no PDF found!)"
    download_helper "$MINUTES_HTML_URL" "$outdir/Minutes.html" || status=1
  fi

  if [[ -n "$AGENDA_COVER_PDF_URL" ]]; then
    echo "Saving cover agenda as PDF..."
    download_helper "$AGENDA_COVER_PDF_URL" "$outdir/Agenda_Cover.pdf" || status=1
  fi

  if [[ -z "$AGENDA_COVER_PDF_URL" && -n "$AGENDA_COVER_HTML_URL" ]]; then
    echo "Saving cover agenda as HTML... (no PDF found!)"
    download_helper "$AGENDA_COVER_HTML_URL" "$outdir/Agenda_Cover.html" || status=1
  fi

  if [[ -n "$ADDENDUM_PDF_URL" ]]; then
    echo "Saving addendum as PDF..."
    download_helper "$ADDENDUM_PDF_URL" "$outdir/Addendum.pdf" || status=1
  fi

  if [[ -z "$ADDENDUM_PDF_URL" && -n "$ADDENDUM_HTML_URL" ]]; then
    echo "Saving addendum as HTML... (no PDF found!)"
    download_helper "$ADDENDUM_HTML_URL" "$outdir/Addendum.html" || status=1
  fi

  return "$status"
}
|
||||
source ./functions/.functions
|
||||
|
||||
# Warning to all who read this script:
|
||||
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
||||
@ -227,7 +29,7 @@ current_day=$(date +%d)
|
||||
SUPPORT_PAST=""
|
||||
|
||||
if [ -d "$TEMP_DIR" ]; then
|
||||
rm -r $TEMP_DIR
|
||||
rm -r $TEMP_DIR
|
||||
fi
|
||||
rm -f $INDEX_PAGE
|
||||
rm -f $SEARCH_PAGE
|
||||
@ -236,225 +38,210 @@ rm -f $AGENDA_HTML
|
||||
mkdir $TEMP_DIR
|
||||
|
||||
while IFS="," read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
||||
INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
INDEX_URL=$(echo "$INDEX_URL_PRE" | sed 's/\"//g' | sed 's/,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
CITY_ARCHIVE_NAME=$(echo "$CITY_ARCHIVE_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
CALENDAR_NAME=$(echo "$CALENDAR_NAME_PRE" | sed 's/\"//g' | sed 's/\,//g' | sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//')
|
||||
INDEX_END="FALSE"
|
||||
while [[ $INDEX_END == "FALSE" ]]; do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --no-hsts --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST="FALSE"
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END="TRUE"
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
echo "-========================================================================-"
|
||||
echo "- $MEETING_NAME"
|
||||
|
||||
INDEX_END="FALSE"
|
||||
while [[ $INDEX_END == "FALSE" ]]; do
|
||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --no-hsts --show-progress
|
||||
if [ $? -ne 8 ]; then
|
||||
FOUNDLIST="FALSE"
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||
if [[ "$GREPENDLIST" == "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: End of list."
|
||||
INDEX_END="TRUE"
|
||||
break
|
||||
else
|
||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||
echo "-========================================================================-"
|
||||
echo "- $MEETING_NAME"
|
||||
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
||||
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
||||
echo "- Corrected to: $MEETING_NAME"
|
||||
fi
|
||||
# Pages start at 1. Ew.
|
||||
x=1
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
|
||||
#cat "${TEMP_DIR}escribe.json" > debug.json
|
||||
|
||||
if [[ "$MEETING_NAME" == "CANCELLED"* ]]; then
|
||||
MEETING_NAME=$(echo $MEETING_NAME | sed 's/^[^ ]* //' | sed 's/^[^ ]* //')
|
||||
echo "- Corrected to: $MEETING_NAME"
|
||||
fi
|
||||
# Pages start at 1. Ew.
|
||||
x=1
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
#cat "${TEMP_DIR}escribe.json" > debug.json
|
||||
y=0
|
||||
i=0
|
||||
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
||||
while (true); do
|
||||
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length')
|
||||
|
||||
y=0
|
||||
i=0
|
||||
NUM_MEETINGS=$(cat "${TEMP_DIR}escribe.json" | jq '.d.TotalCount')
|
||||
while (true); do
|
||||
NUM_IN_JSON=$(cat "${TEMP_DIR}escribe.json" | jq '.d.Meetings | length' )
|
||||
if [[ "$NUM_IN_JSON" == "" ]]; then
|
||||
break
|
||||
fi
|
||||
|
||||
if [[ "$NUM_IN_JSON" == "" ]]; then
|
||||
break
|
||||
fi
|
||||
# Decrease in the meeting count == we're on the final page.
|
||||
if (($i >= $NUM_IN_JSON)) && ((10#$NUM_IN_JSON >= 50)); then
|
||||
((x++))
|
||||
i=0
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . >"${TEMP_DIR}escribe.json"
|
||||
elif (($i >= 10#$NUM_IN_JSON)); then
|
||||
break
|
||||
fi
|
||||
|
||||
# Decrease in the meeting count == we're on the final page.
|
||||
if (( $i >= $NUM_IN_JSON )) && (( 10#$NUM_IN_JSON >= 50)); then
|
||||
((x++))
|
||||
i=0
|
||||
curl -s -d "{'type': '$MEETING_NAME', 'pageNumber': $x}" -H "Content-Type: application/json" -X POST "$INDEX_URL"MeetingsCalendarView.aspx/PastMeetings --insecure | jq . > "${TEMP_DIR}escribe.json"
|
||||
elif (( $i >= 10#$NUM_IN_JSON )); then
|
||||
break
|
||||
fi
|
||||
echo "$(($i + 1)) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
||||
|
||||
echo "$(( $i + 1 )) of $NUM_IN_JSON ($NUM_MEETINGS total) in page $x"
|
||||
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
||||
# No need to cat the entire file every time.
|
||||
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' >"${TEMP_DIR}escribe_short.json"
|
||||
|
||||
# Boost speed by extracting a single meeting from the large JSON, then working on the extract.
|
||||
# No need to cat the entire file every time.
|
||||
cat "${TEMP_DIR}escribe.json" | jq --argjson i "$i" '.d.Meetings.[$i]' > "${TEMP_DIR}escribe_short.json"
|
||||
#echo "> Meeting ID"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
||||
#echo "> Meeting Attachments"
|
||||
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
||||
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
||||
|
||||
#echo "> Meeting ID"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.Id'
|
||||
#echo "> Meeting Attachments"
|
||||
NUM_ATTACHMENTS=$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.MeetingLinks | length')
|
||||
# Will go in a loop, collecting links/types like with the earlier SCRAPE_MEET script.
|
||||
clear_agenda_url
|
||||
for ((j = 0; j <= (($NUM_ATTACHMENTS - 1)); j++)); do
|
||||
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
||||
done
|
||||
|
||||
clear_agenda_url
|
||||
for ((j=0; j<=(( $NUM_ATTACHMENTS - 1 )); j++)); do
|
||||
set_agenda_url "$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Title')" "$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" --argjson j "$j" '.MeetingLinks.[$j].Url')"
|
||||
done
|
||||
_time_parse_helper "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
|
||||
# "25 Feb 2026"
|
||||
if [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Alternate date format."
|
||||
conv_date_alt "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
# "Feb 25 2026"
|
||||
elif [[ "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g' | sed 's/,//')" =~ ^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$ ]]; then
|
||||
echo "Standard date format."
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
else
|
||||
echo "COULD NOT FIGURE OUT DATE FORMAT!"
|
||||
conv_date "$(cat ${TEMP_DIR}escribe_short.json | jq --argjson i "$i" '.DateMedium' | sed 's/\"//g')"
|
||||
fi
|
||||
INPAST=""
|
||||
if ((10#$ITEM_YEAR >= 10#$current_year)) && ((10#$ITEM_MONTH >= $((10#$current_month - 1)))); then
|
||||
echo "NAME : $MEETING_NAME"
|
||||
echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||
echo "A (H) : $AGENDA_HTML_URL"
|
||||
echo "A (P) : $AGENDA_PDF_URL"
|
||||
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
|
||||
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
|
||||
echo "AF(H) : $AGENDA_FULL_HTML_URL"
|
||||
echo "AF(P) : $AGENDA_FULL_PDF_URL"
|
||||
echo "AC(H) : $AGENDA_COVER_HTML_URL"
|
||||
echo "AC(P) : $AGENDA_COVER_PDF_URL"
|
||||
echo "AP(H) : $AGENDA_POST_HTML_URL"
|
||||
echo "AP(P) : $AGENDA_POST_PDF_URL"
|
||||
echo "M (H) : $MINUTES_HTML_URL"
|
||||
echo "M (P) : $MINUTES_PDF_URL"
|
||||
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
|
||||
echo "AD(H) : $ADDENDUM_HTML_URL"
|
||||
echo "AD(P) : $ADDENDUM_PDF_URL"
|
||||
else
|
||||
echo "Dates are in the past!"
|
||||
echo "DATE : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||
INPAST="TRUE"
|
||||
fi
|
||||
|
||||
INPAST=""
|
||||
if (( 10#$MEETING_YEAR >= 10#$current_year )) && (( 10#$MEETING_MONTH >= $((10#$current_month - 1)) )); then
|
||||
echo "NAME : $MEETING_NAME"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
echo "A (H) : $AGENDA_HTML_URL"
|
||||
echo "A (P) : $AGENDA_PDF_URL"
|
||||
echo "AR(H) : $AGENDA_REVISE_HTML_URL"
|
||||
echo "AR(P) : $AGENDA_REVISE_PDF_URL"
|
||||
echo "AF(H) : $AGENDA_FULL_HTML_URL"
|
||||
echo "AF(P) : $AGENDA_FULL_PDF_URL"
|
||||
echo "AC(H) : $AGENDA_COVER_HTML_URL"
|
||||
echo "AC(P) : $AGENDA_COVER_PDF_URL"
|
||||
echo "AP(H) : $AGENDA_POST_HTML_URL"
|
||||
echo "AP(P) : $AGENDA_POST_PDF_URL"
|
||||
echo "M (H) : $MINUTES_HTML_URL"
|
||||
echo "M (P) : $MINUTES_PDF_URL"
|
||||
echo "MA(P) : $MINUTES_ATTACH_PDF_URL"
|
||||
echo "AD(H) : $ADDENDUM_HTML_URL"
|
||||
echo "AD(P) : $ADDENDUM_PDF_URL"
|
||||
else
|
||||
echo "Dates are in the past!"
|
||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||
INPAST="TRUE"
|
||||
fi
|
||||
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
||||
echo "Abort."
|
||||
break
|
||||
fi
|
||||
|
||||
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
||||
echo "Abort."
|
||||
break
|
||||
fi
|
||||
#echo "> Meeting Video"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||
|
||||
#echo "> Meeting Video"
|
||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||
ERROR="FALSE"
|
||||
ADDENDUM_ERROR="FALSE"
|
||||
echo "Downloading agenda HTML..."
|
||||
|
||||
ERROR="FALSE"
|
||||
ADDENDUM_ERROR="FALSE"
|
||||
echo "Downloading agenda HTML..."
|
||||
if [[ -n $AGENDA_REVISE_HTML_URL ]]; then
|
||||
_utils_download_helper "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
|
||||
elif [[ -n $AGENDA_HTML_URL ]]; then
|
||||
_utils_download_helper "$AGENDA_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
if [[ -n $AGENDA_REVISE_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_REVISE_HTML_URL" "$AGENDA_HTML"
|
||||
elif [[ -n $AGENDA_FULL_HTML_URL ]]; then
|
||||
_utils_download_helper "$AGENDA_FULL_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
elif [[ -n $AGENDA_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_HTML_URL" "$AGENDA_HTML"
|
||||
elif [[ -n $AGENDA_POST_HTML_URL ]]; then
|
||||
_utils_download_helper "$AGENDA_POST_HTML_URL" "$AGENDA_HTML"
|
||||
|
||||
elif [[ -n $AGENDA_FULL_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_FULL_HTML_URL" "$AGENDA_HTML"
|
||||
elif [[ -n $AGENDA_COVER_HTML_URL ]]; then
|
||||
_utils_download_helper "$AGENDA_COVER_HTML_URL" "$AGENDA_HTML"
|
||||
else
|
||||
ERROR="TRUE"
|
||||
fi
|
||||
|
||||
elif [[ -n $AGENDA_POST_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_POST_HTML_URL" "$AGENDA_HTML"
|
||||
if [[ -n $ADDENDUM_HTML_URL ]]; then
|
||||
_utils_download_helper "$ADDENDUM_HTML_URL" "$ADDENDUM_HTML"
|
||||
else
|
||||
ADDENDUM_ERROR="TRUE"
|
||||
fi
|
||||
|
||||
elif [[ -n $AGENDA_COVER_HTML_URL ]]; then
|
||||
download_helper "$AGENDA_COVER_HTML_URL" "$AGENDA_HTML"
|
||||
if [[ "$ERROR" == "FALSE" ]]; then
|
||||
|
||||
else
|
||||
ERROR="TRUE"
|
||||
fi
|
||||
mkdir "./$CITY_ARCHIVE_NAME"
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
|
||||
fi
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$ITEM_YEAR/"
|
||||
fi
|
||||
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$ITEM_YEAR" "$ITEM_MONTH" "$ITEM_DAY")
|
||||
if [ ! -d "$MEETING_DIR" ]; then
|
||||
mkdir "$MEETING_DIR/"
|
||||
fi
|
||||
if [ ! -d "$MEETING_DIR/Attachments" ]; then
|
||||
mkdir "$MEETING_DIR/Attachments/"
|
||||
fi
|
||||
|
||||
if [[ -n $ADDENDUM_HTML_URL ]]; then
|
||||
download_helper "$ADDENDUM_HTML_URL" "$ADDENDUM_HTML"
|
||||
else
|
||||
ADDENDUM_ERROR="TRUE"
|
||||
fi
|
||||
if [[ $VIDEO_URL != "" ]]; then
|
||||
echo "Saving recording URL..."
|
||||
echo "https://video.isilive.ca/london/"$VIDEO_URL >"$MEETING_DIR/RecordingLink.txt"
|
||||
fi
|
||||
|
||||
if [[ "$ERROR" == "FALSE" ]]; then
|
||||
# Get attachment links
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
|
||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
||||
# Get attachment links
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' >"./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' >"./tmp/attachment_names"
|
||||
fi
|
||||
# Download attachment and use the name grabbed above
|
||||
echo "Found the following agenda attachments:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
echo "- $LINEA2 / $LINEA1"
|
||||
_utils_download_helper "$INDEX_URL$LINEA1" "$MEETING_DIR/Attachments/$LINEA2"
|
||||
# [ ! -s "$MEETING_DIR/Attachments/$LINEA2" ] && rm -f "$MEETING_DIR/Attachments/$LINEA2"
|
||||
done < ./tmp/attachment_urls 3<./tmp/attachment_names
|
||||
echo "All attachments saved."
|
||||
|
||||
mkdir "./$CITY_ARCHIVE_NAME"
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||
download_agendas "$MEETING_DIR"
|
||||
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/"
|
||||
fi
|
||||
if [ ! -d "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR" ]; then
|
||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings/$MEETING_NAME/$MEETING_YEAR/"
|
||||
fi
|
||||
MEETING_DIR=$(printf "./$CITY_ARCHIVE_NAME/Meetings/%s/%s/%s-%s" "$MEETING_NAME" "$MEETING_YEAR" "$MEETING_MONTH" "$MEETING_DAY")
|
||||
if [ ! -d "$MEETING_DIR" ]; then
|
||||
mkdir "$MEETING_DIR/"
|
||||
fi
|
||||
if [ ! -d "$MEETING_DIR/Attachments" ]; then
|
||||
mkdir "$MEETING_DIR/Attachments/"
|
||||
fi
|
||||
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
||||
echo "dir not empty" >>/dev/null
|
||||
else
|
||||
rm -r "$MEETING_DIR/Attachments"
|
||||
fi
|
||||
|
||||
if [[ $VIDEO_URL != "" ]]; then
|
||||
echo "Saving recording URL..."
|
||||
echo "https://video.isilive.ca/london/"$VIDEO_URL > "$MEETING_DIR/RecordingLink.txt"
|
||||
fi
|
||||
echo "All files from this meeting have been saved."
|
||||
find "$MEETING_DIR" -type f -size 0 -delete
|
||||
echo "Cleaning PDFs for archive.org..."
|
||||
find "$MEETING_DIR" -type f -name '*.pdf' -print0 | xargs -0 -n1 qpdf --replace-input
|
||||
# qpdf repairs and leaves garbage original PDFs
|
||||
find "$MEETING_DIR" -type f -name '*~qpdf-orig' -delete -print
|
||||
fi
|
||||
|
||||
# Get attachment links
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
||||
# Get attachment links
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/^.*AgendaHeaderTitle/AgendaHeaderTitle/' | sed 's/href=.[Ff]ile[Ss]tream\.ashx/\nhref="filestream\.ashx/g' | grep -i 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^"]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||
# Get attachment names
|
||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed -E "s/data-original-title=['\"]/\\ndata-original-title='/g" | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||
fi
|
||||
# Download attachment and use the name grabbed above
|
||||
echo "Found the following agenda attachments:"
|
||||
while IFS= read -r LINEA1 && IFS= read -r LINEA2 <&3; do
|
||||
echo "- $LINEA2 / $LINEA1"
|
||||
download_helper "$INDEX_URL$LINEA1" "$MEETING_DIR/Attachments/$LINEA2"
|
||||
# [ ! -s "$MEETING_DIR/Attachments/$LINEA2" ] && rm -f "$MEETING_DIR/Attachments/$LINEA2"
|
||||
done < ./tmp/attachment_urls 3< ./tmp/attachment_names
|
||||
echo "All attachments saved."
|
||||
((i++))
|
||||
((y++))
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
download_agendas "$MEETING_DIR"
|
||||
|
||||
if find "$MEETING_DIR/Attachments" -mindepth 1 -maxdepth 1 | read; then
|
||||
echo "dir not empty" >> /dev/null
|
||||
else
|
||||
rm -r "$MEETING_DIR/Attachments"
|
||||
fi
|
||||
|
||||
echo "All files from this meeting have been saved."
|
||||
find "$MEETING_DIR" -type f -size 0 -delete
|
||||
echo "Cleaning PDFs for archive.org..."
|
||||
find "$MEETING_DIR" -type f -name '*.pdf' -print0 | xargs -0 -n1 qpdf --replace-input
|
||||
# qpdf repairs and leaves garbage original PDFs
|
||||
find "$MEETING_DIR" -type f -name '*~qpdf-orig' -delete -print
|
||||
fi
|
||||
|
||||
((i++))
|
||||
((y++))
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST="TRUE"
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END="TRUE"
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||
if [[ "$GREPLIST" != "" ]]; then
|
||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||
FOUNDLIST="TRUE"
|
||||
fi
|
||||
done < $INDEX_PAGE
|
||||
else
|
||||
INDEX_END="TRUE"
|
||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||
fi
|
||||
done
|
||||
done < websites.csv
|
||||
|
||||
47
SCRAPE_MPaS.SH
Executable file
47
SCRAPE_MPaS.SH
Executable file
@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env bash
#
# SCRAPE_MPaS.SH: download every PDF linked from the City of London
# "Master Plans and Strategies" page into
#   ./LondonArchive/Master Plans and Strategies/<section heading>/
# The section heading is the text of the most recent <summary> element (or of
# the <span> that directly follows it) seen before the PDF link.
#
# Requires ./functions/.functions (provides _utils_download_helper).

echo -e "\n-========================================================================-"
echo -e "-=- -=-"
echo -e "-=- SCRAPE_MPaS.SH: Scrape London Master Plans and Strategies -=-"
echo -e "-=- -=-"
echo -e "-=- Lillian Skinner -=-"
echo -e "-=- -=-"
echo -e "-========================================================================-"

source ./functions/.functions

# London seems to have recently blocked unusual user agents (plain wget and
# even ping are refused), so present a regular desktop-browser user agent.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index_mpas.html"

rm -f "$SEARCH_PAGE"

# -p: the directory normally already exists on re-runs.
mkdir -p "$TEMP_DIR"

SEARCH_URL="https://london.ca/government/council-civic-administration/master-plans-strategies/plans-strategies"

# --timestamping was dropped: wget documents it as having no effect together
# with -O, and the target file has just been removed anyway.
wget --user-agent="$WGET_UA" "$SEARCH_URL" -O "$SEARCH_PAGE" -q #--show-progress
# Exit status 8 means the server answered with an error response.
if [ $? -ne 8 ]; then
  # Put every tag on its own line so the loop can look at one element at a time.
  sed 's/></>\n</g' "$SEARCH_PAGE" |
    while IFS= read -r LINE; do
      # A <span> directly after a <summary> line carries the section title.
      if (( LAST_LINE_SUMMARY )) && [[ "$LINE" == "<span>"* ]]; then
        CURRENT=$(echo "$LINE" | sed 's/></>\n</g' | sed 's/.*<span>\([^<]*\)<\/span>.*/\1/')
        echo "$CURRENT"
      fi

      LAST_LINE_SUMMARY=0
      if [[ "$LINE" == *"<summary>"* ]]; then
        LAST_LINE_SUMMARY=1
        CURRENT=$(echo "$LINE" | sed 's/></>\n</g' | sed 's/.*<summary>\([^<]*\)<\/summary>.*/\1/')
        echo "$CURRENT"
      fi

      if [[ "$LINE" == *".pdf"* ]]; then
        # Site-relative path of the PDF, with %20 decoded and %27 removed.
        # NOTE(review): the decoded path is also what gets requested; confirm
        # the server accepts spaces / missing apostrophes in the URL.
        FILE="$(echo "$LINE" | grep -o 'href="[^"]*\.pdf"' | sed 's/^href="//; s/"$//; s#^https://london\.ca##' | sed 's/%20/ /g' | sed 's/%27//g')"
        # ".pdf" can appear outside an href (e.g. in link text); nothing to fetch then.
        [[ -n "$FILE" ]] || continue
        echo "$FILE"
        mkdir -p "./LondonArchive/Master Plans and Strategies/$CURRENT/"
        _utils_download_helper "https://london.ca$FILE" "./LondonArchive/Master Plans and Strategies/$CURRENT/$(basename "$FILE")"
      fi
    done
fi
|
||||
25
SCRAPE_OPEN.SH
Normal file → Executable file
25
SCRAPE_OPEN.SH
Normal file → Executable file
@ -8,6 +8,8 @@ echo -e "-=- Lillian Skinner
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
source ./functions/.functions
|
||||
|
||||
WORKDIR="./tmp"
|
||||
STAGEDIR="./staging"
|
||||
DOCDIR="./LondonArchive/OpenData"
|
||||
@ -52,19 +54,25 @@ while [[ $SEARCH_END == 0 ]]; do
|
||||
echo " Cur. article: $i.$j, URL : $ITEM_URL"
|
||||
echo " Cur. article: $i.$j, Name : $ITEM_NAME"
|
||||
|
||||
rm -rf $STAGEDIR
|
||||
mkdir $STAGEDIR
|
||||
#rm -rf $STAGEDIR
|
||||
#mkdir $STAGEDIR
|
||||
|
||||
if [[ $ITEM_NAME != "" ]] && [[ $ITEM_NAME != "null" ]]; then
|
||||
_utils_download_helper "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" "$DOCDIR/$ITEM_NAME"
|
||||
wget --user-agent="$WGET_UA" "https://www.arcgis.com/sharing/rest/content/items/$ITEM_ID/data" -O "$DOCDIR/$ITEM_NAME" -c -q
|
||||
echo " Downloaded."
|
||||
|
||||
echo "Compressing."
|
||||
echo "(Not) Compressing."
|
||||
# No need to compress non-map data.
|
||||
#7z a "$DOCDIR/$ITEM_NAME.7z" "$STAGEDIR"
|
||||
fi
|
||||
|
||||
# This section is deprecated. Use SCRAPE_AGIS.SH instead.
|
||||
if [[ $ITEM_URL == *"maps.london.ca/server/rest/services"* ]] && (( DOWNLOAD_MAPS )); then
|
||||
MAPDIR_ITEM=$(echo "$MAPDIR/$ITEM_TITLE")
|
||||
mkdir -p "$MAPDIR_ITEM"
|
||||
echo "Item: $MAPDIR_ITEM"
|
||||
|
||||
MAP_ID="$(echo $ITEM_URL | sed 's/^.*\/MapServer\///')"
|
||||
echo " ^^^ Item is map. ($MAP_ID) "
|
||||
# https://hub.arcgis.com/api/v3/datasets/$ITEM_ID/downloads/data?format=[csv/shp/geojson/kml]&spatialRefId=$SPATIAL_ID&where=1=1
|
||||
@ -74,21 +82,22 @@ while [[ $SEARCH_END == 0 ]]; do
|
||||
MAP_GEO="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=geojson&spatialRefId=4326&where=1=1"
|
||||
MAP_KML="https://hub.arcgis.com/api/v3/datasets/${ITEM_ID}_${MAP_ID}/downloads/data?format=kml&spatialRefId=4326&where=1=1"
|
||||
echo " Map URL (CSV) : $MAP_CSV"
|
||||
wget --user-agent="$WGET_UA" "$MAP_CSV" -O "$STAGEDIR/$ITEM_TITLE.csv" -c -q
|
||||
_utils_download_helper "$MAP_CSV" "$MAPDIR_ITEM/$ITEM_TITLE.csv"
|
||||
echo " Downloaded."
|
||||
echo " Map URL (Shapefile): $MAP_SHP"
|
||||
wget --user-agent="$WGET_UA" "$MAP_SHP" -O "$STAGEDIR/$ITEM_TITLE.shp" -c -q
|
||||
_utils_download_helper "$MAP_SHP" "$MAPDIR_ITEM/$ITEM_TITLE.shp"
|
||||
echo " Downloaded."
|
||||
echo " Map URL (GeoJSON) : $MAP_GEO"
|
||||
wget --user-agent="$WGET_UA" "$MAP_GEO" -O "$STAGEDIR/$ITEM_TITLE.geojson" -c -q
|
||||
_utils_download_helper "$MAP_GEO" "$MAPDIR_ITEM/$ITEM_TITLE.geojson"
|
||||
echo " Downloaded."
|
||||
echo " Map URL (KML) : $MAP_KML"
|
||||
wget --user-agent="$WGET_UA" "$MAP_KML" -O "$STAGEDIR/$ITEM_TITLE.kml" -c -q
|
||||
_utils_download_helper "$MAP_KML" "$MAPDIR_ITEM/$ITEM_TITLE.kml"
|
||||
echo " Downloaded."
|
||||
echo ' Source URL is $ITEM_URL.'
|
||||
|
||||
echo "Compressing."
|
||||
7z a "$MAPDIR/$ITEM_TITLE.7z" "$STAGEDIR"
|
||||
rm -f "$MAPDIR_ITEM/$ITEM_TITLE.7z"
|
||||
7z a "$MAPDIR_ITEM/$ITEM_TITLE.7z" "$MAPDIR_ITEM"
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
58
SCRAPE_PLAN.SH
Normal file → Executable file
58
SCRAPE_PLAN.SH
Normal file → Executable file
@ -7,49 +7,7 @@ echo -e "-=- Lillian Skinner
|
||||
echo -e "-=- -=-"
|
||||
echo -e "-========================================================================-"
|
||||
|
||||
# Extract a date from a line of text such as "Monday, January 8, 2024, ...".
# Arguments: $1 - text containing ", <Month> <day>," and a four-digit year
# Sets globals:
#   PROJECT_TIME_YEAR, PROJECT_TIME_MONTH_WORD, PROJECT_TIME_DAY_SHORT,
#   PROJECT_TIME_DAY   (zero padded, "00" if no numeric day was found)
#   PROJECT_TIME_MONTH ("01".."12", or "--" for an unknown month)
conv_date_plan() {
  # Collapse runs of whitespace the way the former unquoted `echo $1` did,
  # but without exposing the text to pathname expansion.
  local IFS=$' \t\n'
  local -a words=()
  read -r -d '' -a words <<<"$1"
  local text="${words[*]}"

  PROJECT_TIME_YEAR=$(printf '%s\n' "$text" | sed 's/.*\([0-9]\{4\}\).*/\1/p' | uniq)
  PROJECT_TIME_MONTH_WORD=$(printf '%s\n' "$text" | sed 's/.*,\s*\([A-Za-z]*\)\s[0-9]\{1,2\},.*/\1/p' | uniq)
  PROJECT_TIME_DAY_SHORT=$(printf '%s\n' "$text" | sed 's/.*,\s*[A-Za-z]*\s\([0-9]\{1,2\}\),.*/\1/p' | uniq)

  # Force base 10: printf would otherwise reject "08" and "09" as invalid
  # octal numbers and print "00".
  if [[ $PROJECT_TIME_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v PROJECT_TIME_DAY '%02d' "$((10#$PROJECT_TIME_DAY_SHORT))"
  else
    PROJECT_TIME_DAY="00"
  fi

  case "$PROJECT_TIME_MONTH_WORD" in
    Jan*) PROJECT_TIME_MONTH="01" ;;
    Feb*) PROJECT_TIME_MONTH="02" ;;
    Mar*) PROJECT_TIME_MONTH="03" ;;
    Apr*) PROJECT_TIME_MONTH="04" ;;
    May) PROJECT_TIME_MONTH="05" ;;
    Jun*) PROJECT_TIME_MONTH="06" ;;
    Jul*) PROJECT_TIME_MONTH="07" ;;
    Aug*) PROJECT_TIME_MONTH="08" ;;
    Sep*) PROJECT_TIME_MONTH="09" ;;
    Oct*) PROJECT_TIME_MONTH="10" ;;
    Nov*) PROJECT_TIME_MONTH="11" ;;
    Dec*) PROJECT_TIME_MONTH="12" ;;
    *) PROJECT_TIME_MONTH="--" ;;
  esac
}
|
||||
|
||||
# Parse a "Month D, YYYY" date (e.g. "January 8, 2024").
# Arguments: $1 - the date string
# Sets globals:
#   MODIFIED_MONTH_WORD, MODIFIED_DAY_SHORT, MODIFIED_YEAR
#   MODIFIED_DAY   (zero padded, "00" if no numeric day was found)
#   MODIFIED_MONTH ("01".."12", or "--" for an unknown month)
conv_date() {
  MODIFIED_MONTH_WORD=$(echo "$1" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  MODIFIED_DAY_SHORT=$(echo "$1" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
  MODIFIED_YEAR=$(echo "$1" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')

  # Force base 10: printf would otherwise reject "08" and "09" as invalid
  # octal numbers and print "00".
  if [[ $MODIFIED_DAY_SHORT =~ ^[0-9]+$ ]]; then
    printf -v MODIFIED_DAY '%02d' "$((10#$MODIFIED_DAY_SHORT))"
  else
    MODIFIED_DAY="00"
  fi

  case "$MODIFIED_MONTH_WORD" in
    Jan*) MODIFIED_MONTH="01" ;;
    Feb*) MODIFIED_MONTH="02" ;;
    Mar*) MODIFIED_MONTH="03" ;;
    Apr*) MODIFIED_MONTH="04" ;;
    May) MODIFIED_MONTH="05" ;;
    Jun*) MODIFIED_MONTH="06" ;;
    Jul*) MODIFIED_MONTH="07" ;;
    Aug*) MODIFIED_MONTH="08" ;;
    Sep*) MODIFIED_MONTH="09" ;;
    Oct*) MODIFIED_MONTH="10" ;;
    Nov*) MODIFIED_MONTH="11" ;;
    Dec*) MODIFIED_MONTH="12" ;;
    *) MODIFIED_MONTH="--" ;;
  esac
}
|
||||
source ./functions/.functions
|
||||
|
||||
# Warning to all who read this script:
|
||||
# It is bad. I know it is bad, but I am tired okay, and sometimes sloppy just works.
|
||||
@ -107,11 +65,11 @@ while (( ! SEARCH_END )); do
|
||||
PROJECT_NAME=$(cat $PROJECT_PAGE | grep "page-title" | grep "field--name-title" | sed 's/.*<span[^>]*>\([^<]*\)<[\/:-]span>.*/\1/p' | sed 's/&/\&/g' | sed 's/'/'\''/g' | sed 's/^COVID-19//p' | uniq | tr -d '\r' | tr -d '\n' | tr '/' '-')
|
||||
echo " Found project: $PROJECT_NAME"
|
||||
|
||||
MODIFIED_MONTH=""
|
||||
MODIFIED_YEAR=""
|
||||
conv_date "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
|
||||
if (( 10#$MODIFIED_YEAR >= 10#$current_year )) && (( 10#$MODIFIED_MONTH >= $((10#$current_month - 1)) )); then
|
||||
echo "Last Modified: $MODIFIED_YEAR/$MODIFIED_MONTH/$MODIFIED_DAY"
|
||||
ITEM_MONTH=""
|
||||
ITEM_YEAR=""
|
||||
_time_parse_helper "$(cat "$PROJECT_PAGE" | grep "Last modified:" | sed 's/.*<\/span>//' | sed 's/<\/div>.*//' | sed 's/^[^, ]*, //' | grep -E '[0-9]{4}')"
|
||||
if (( 10#$ITEM_YEAR >= 10#$current_year )) && (( 10#$ITEM_MONTH >= $((10#$current_month - 1)) )); then
|
||||
echo "Last Modified: $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||
else
|
||||
echo "Dates are in the past! Abort."
|
||||
break
|
||||
@ -208,8 +166,8 @@ while (( ! SEARCH_END )); do
|
||||
fi
|
||||
PROJECT_FOUND_TIME=$(echo $PLINE | grep "datetime")
|
||||
if [[ $PROJECT_FOUND_TIME != "" ]]; then
|
||||
conv_date_plan "$PLINE"
|
||||
echo "Found date : $PROJECT_TIME_YEAR/$PROJECT_TIME_MONTH/$PROJECT_TIME_DAY"
|
||||
_time_parse_helper "$(echo $PLINE | sed 's/.*<time[^>]*>\([^<]*\)<[\/:-]time>.*/\1/g' | cut -d, -f2- | cut -d\ -f2-)"
|
||||
echo "Found date : $ITEM_YEAR/$ITEM_MONTH/$ITEM_DAY"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
9
functions/.functions
Normal file
9
functions/.functions
Normal file
@ -0,0 +1,9 @@
|
||||
# Entry point for the shared function library: resolves the directory this
# file lives in and sources every function file from there, so callers only
# need `source ./functions/.functions`.
sdir="$(dirname "${BASH_SOURCE[0]}")"
sdir="$(cd "$sdir" && pwd)"

# General-purpose helpers
. "$sdir/.functions.time"
. "$sdir/.functions.utils"

# Helpers tied to one specific scraper
. "$sdir/.functions.filepro"
. "$sdir/.functions.escribe"
|
||||
133
functions/.functions.escribe
Normal file
133
functions/.functions.escribe
Normal file
@ -0,0 +1,133 @@
|
||||
# Store one meeting-document URL in the global that matches its label.
# Arguments:
#   $1 - document label including its surrounding double quotes,
#        e.g. '"Agenda (HTML)"'
#   $2 - the URL; every double-quote character is removed before storing
# Sets exactly one of the AGENDA_*_URL / MINUTES_*_URL / ADDENDUM_*_URL
# globals; unknown labels are ignored.
set_agenda_url() {
  local dq='"'
  # Parameter expansion instead of `echo $2 | sed`: the unquoted echo let the
  # shell glob-expand URLs containing "?" or "*" against the current directory.
  local url=${2//$dq/}
  local target

  case "$1" in
    '"Agenda (HTML)"') target=AGENDA_HTML_URL ;;
    '"Agenda (PDF)"') target=AGENDA_PDF_URL ;;
    '"Revised Agenda (HTML)"') target=AGENDA_REVISE_HTML_URL ;;
    '"Revised Agenda (PDF)"') target=AGENDA_REVISE_PDF_URL ;;
    '"Minutes (HTML)"') target=MINUTES_HTML_URL ;;
    '"Minutes (PDF)"') target=MINUTES_PDF_URL ;;
    '"Minutes with Attachments (PDF)"') target=MINUTES_ATTACH_PDF_URL ;;
    '"Agenda Full Package (HTML)"') target=AGENDA_FULL_HTML_URL ;;
    '"Agenda Full Package (PDF)"') target=AGENDA_FULL_PDF_URL ;;
    '"Agenda Cover Page (HTML)"') target=AGENDA_COVER_HTML_URL ;;
    '"Agenda Cover Page (PDF)"') target=AGENDA_COVER_PDF_URL ;;
    '"Post Agenda (HTML)"') target=AGENDA_POST_HTML_URL ;;
    '"Post Agenda (PDF)"') target=AGENDA_POST_PDF_URL ;;
    '"Addendum (HTML)"') target=ADDENDUM_HTML_URL ;;
    '"Addendum (PDF)"') target=ADDENDUM_PDF_URL ;;
    *) return 0 ;;
  esac

  printf -v "$target" '%s' "$url"
}
|
||||
|
||||
# Reset every document-URL global filled in by set_agenda_url to the empty
# string, so values from a previous meeting cannot leak into the next one.
clear_agenda_url() {
  local slot
  for slot in \
    AGENDA_HTML_URL AGENDA_PDF_URL \
    AGENDA_REVISE_HTML_URL AGENDA_REVISE_PDF_URL \
    MINUTES_HTML_URL MINUTES_PDF_URL MINUTES_ATTACH_PDF_URL \
    AGENDA_FULL_HTML_URL AGENDA_FULL_PDF_URL \
    AGENDA_COVER_HTML_URL AGENDA_COVER_PDF_URL \
    AGENDA_POST_HTML_URL AGENDA_POST_PDF_URL \
    ADDENDUM_HTML_URL ADDENDUM_PDF_URL; do
    printf -v "$slot" '%s' ''
  done
}
|
||||
|
||||
# Save every agenda / minutes / addendum document whose URL global is set.
# Arguments: $1 - directory the files are written into
# Reads the *_URL globals filled in by set_agenda_url.
# Each table row below is:
#   <URL variable>|<variables that must be empty>|<log message>|<file name>
# Rows are processed top to bottom; a row is downloaded when its URL variable
# is non-empty and all of its "must be empty" variables are empty (this is
# how an HTML copy is only taken when the PDF is missing).
# Returns the status of the last row: 0 if it was skipped, otherwise the
# status of its download.
download_agendas() {
  local dest_dir="$1"
  local -a plan=(
    'AGENDA_REVISE_PDF_URL||Saving revised agenda as PDF...|Agenda_Revised.pdf'
    'AGENDA_PDF_URL||Saving regular agenda as PDF...|Agenda.pdf'
    'AGENDA_REVISE_HTML_URL|AGENDA_REVISE_PDF_URL|Saving revised agenda as HTML... (no PDF found!)|Agenda_Revised.html'
    'AGENDA_HTML_URL|AGENDA_PDF_URL|Saving regular agenda as HTML... (no PDF found!)|Agenda.html'
    'AGENDA_FULL_PDF_URL||Saving full package agenda as PDF... (no HTML found!)|Agenda_FullPackage.pdf'
    'AGENDA_FULL_HTML_URL|AGENDA_FULL_PDF_URL|Saving full package agenda as HTML... (no PDF found!)|Agenda_FullPackage.html'
    'AGENDA_POST_PDF_URL||Saving post agenda as PDF...|Agenda_Post.pdf'
    'AGENDA_POST_HTML_URL|AGENDA_POST_PDF_URL|Saving post agenda as HTML... (no PDF found!)|Agenda_Post.html'
    'MINUTES_ATTACH_PDF_URL||Saving minutes with attachments as PDF...|Minutes_With_Attachments.pdf'
    'MINUTES_PDF_URL||Saving minutes as PDF...|Minutes.pdf'
    'MINUTES_HTML_URL|MINUTES_ATTACH_PDF_URL MINUTES_PDF_URL|Saving minutes as HTML... (no PDF found!)|Minutes.html'
    'AGENDA_COVER_PDF_URL||Saving cover agenda as PDF... (no HTML found!)|Agenda_Cover.pdf'
    'AGENDA_COVER_HTML_URL|AGENDA_COVER_PDF_URL|Saving cover agenda as HTML... (no PDF found!)|Agenda_Cover.html'
    'ADDENDUM_PDF_URL||Saving addendum as PDF... (no HTML found!)|Addendum.pdf'
    'ADDENDUM_HTML_URL|ADDENDUM_PDF_URL|Saving addendum as HTML... (no PDF found!)|Addendum.html'
  )
  local row src_var blockers note fname blocker wanted
  local -a needs_empty
  local last_status=0

  for row in "${plan[@]}"; do
    IFS='|' read -r src_var blockers note fname <<<"$row"
    IFS=' ' read -r -a needs_empty <<<"$blockers"

    last_status=0
    wanted=1
    [[ -n ${!src_var} ]] || wanted=0
    for blocker in "${needs_empty[@]}"; do
      [[ -z ${!blocker} ]] || wanted=0
    done

    if (( wanted )); then
      echo "$note"
      _utils_download_helper "${!src_var}" "$dest_dir/$fname"
      last_status=$?
    fi
  done

  return "$last_status"
}
|
||||
34
functions/.functions.filepro
Normal file
34
functions/.functions.filepro
Normal file
@ -0,0 +1,34 @@
|
||||
# Recursively mirror a FilePro folder listing.
# Arguments:
#   $1 - local directory documents are saved into
#   $2 - URL of the folder index page
#   $3 - folder path shown in log output (optional)
# Globals read: WGET_UA (user agent), START_URL (site base URL).
# Each index line is expected to carry data-id, data-title and data-type
# attributes; "document" entries are downloaded as "<title>.pdf" and "folder"
# entries are descended into, using the title as the sub-directory name.
# Returns 1 on bad usage or if the index could not be fetched, else 0.
_filepro_download_folder() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <output dir> <index url> [folder path]" >&2
    # return, not exit: this file is sourced, exit would kill the scraper.
    return 1
  fi

  local tmp_dir="$1"
  local index_url="$2"
  local folder_path="${3:-}"
  local tmp_index
  tmp_index=$(mktemp) || return 1

  local IFS=$' \t\n'
  local LINE
  local LINE_ID
  local LINE_TITLE
  local LINE_TYPE
  local -a words

  if ! wget --no-check-certificate --user-agent="$WGET_UA" "$index_url" -O "$tmp_index" --no-hsts -q; then
    echo "Could not fetch folder index: $index_url" >&2
    rm -f -- "$tmp_index"
    return 1
  fi

  # The folder id is already part of $3 (appended by the recursive call below).
  echo "Looking in folder $folder_path"
  echo "Download to $tmp_dir/"

  while IFS= read -r LINE; do
    # Collapse whitespace exactly like the former unquoted `echo $LINE`, but
    # without letting the shell glob-expand "?" / "*" found in the markup.
    read -r -a words <<<"$LINE"
    LINE="${words[*]}"

    LINE_ID=$(printf '%s\n' "$LINE" | sed 's/.*data-id="\([^"]*\)".*/\1/g')
    # Decode the HTML entities that may occur in titles.
    # NOTE(review): the entity names were lost when this file was rendered;
    # &amp; / &#39; / &#039; are reconstructed from the replacement text.
    LINE_TITLE=$(printf '%s\n' "$LINE" |
      sed -e 's/.*data-title="\([^"]*\)".*/\1/g' \
        -e 's/&amp;/\&/g' \
        -e "s/&#39;/'/g" \
        -e "s/&#039;/'/g")
    LINE_TYPE=$(printf '%s\n' "$LINE" | sed 's/.*data-type="\([^"]*\)".*/\1/g')

    if [[ "$LINE_TYPE" == "document" ]]; then
      echo "Found document: $LINE_ID : $LINE_TITLE.pdf... downloading..."
      mkdir -p "$tmp_dir"
      _utils_download_helper "${START_URL}/document/$LINE_ID" "$tmp_dir/$LINE_TITLE.pdf"
    elif [[ "$LINE_TYPE" == "folder" ]]; then
      _filepro_download_folder "$tmp_dir/$LINE_TITLE" "${START_URL}/filepro/documents/$LINE_ID" "$folder_path/$LINE_ID"
    fi
  done < "$tmp_index"

  rm -f -- "$tmp_index"
}
|
||||
71
functions/.functions.time
Normal file
71
functions/.functions.time
Normal file
@ -0,0 +1,71 @@
|
||||
# Store the pieces of a parsed date in the ITEM_* globals.
# Arguments: $1 - month word, $2 - day digits, $3 - year
# Sets ITEM_MONTH_WORD, ITEM_DAY_SHORT, ITEM_YEAR, ITEM_DAY (zero padded,
# "00" if $2 is not numeric) and ITEM_MONTH ("01".."12", "--" if unknown).
_time_store_item_date() {
  ITEM_MONTH_WORD="$1"
  ITEM_DAY_SHORT="$2"
  ITEM_YEAR="$3"

  # Force base 10: printf would otherwise reject "08" and "09" as invalid
  # octal numbers and print "00".
  if [[ $2 =~ ^[0-9]+$ ]]; then
    printf -v ITEM_DAY '%02d' "$((10#$2))"
  else
    ITEM_DAY="00"
  fi

  case "$1" in
    Jan*) ITEM_MONTH="01" ;;
    Feb*) ITEM_MONTH="02" ;;
    Mar*) ITEM_MONTH="03" ;;
    Apr*) ITEM_MONTH="04" ;;
    May*) ITEM_MONTH="05" ;;
    Jun*) ITEM_MONTH="06" ;;
    Jul*) ITEM_MONTH="07" ;;
    Aug*) ITEM_MONTH="08" ;;
    Sep*) ITEM_MONTH="09" ;;
    Oct*) ITEM_MONTH="10" ;;
    Nov*) ITEM_MONTH="11" ;;
    Dec*) ITEM_MONTH="12" ;;
    *) ITEM_MONTH="--" ;;
  esac
}

# Parse a date whose format is not known in advance.
# Arguments: $1 - date as "D Month YYYY" or "Month D, YYYY"; double quotes
#                 and the comma are optional
# Outputs:   echoes the (whitespace-normalised) input, and a notice when the
#            format is not recognised
# Sets:      the ITEM_* globals, via the format-specific parser
# Returns:   0 on success, 1 on bad usage or an unrecognised format
_time_parse_helper() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date>" >&2
    # return, not exit: this file is sourced, exit would kill the scraper.
    return 1
  fi

  local IFS=$' \t\n'
  local dq='"'
  local -a words=()

  # Same normalisation the former unquoted `echo $1` performed (trim and
  # collapse whitespace), without exposing the text to pathname expansion.
  read -r -d '' -a words <<<"$1"
  printf '%s\n' "${words[*]}"

  read -r -d '' -a words <<<"${1//$dq/}"
  local text="${words[*]}"
  # Format detection ignores the first comma ("January 5, 2024").
  local probe=${text/,/}

  local day_first='^([0-9]{1,2})[[:space:]]+(.+)[[:space:]]+([0-9]{4})$'
  local month_first='^(.+)[[:space:]]+([0-9]{1,2})[[:space:]]+([0-9]{4})$'

  # Hand the cleaned text (not the raw argument) to the parsers so that the
  # quotes stripped for detection cannot end up inside ITEM_DAY / ITEM_YEAR.
  if [[ $probe =~ $day_first ]]; then
    _time_parse_ddmonyyyy "$text"
  elif [[ $probe =~ $month_first ]]; then
    _time_parse_monddyyyy "$text"
  else
    echo "COULD NOT FIGURE OUT DATE FORMAT!"
    return 1
  fi
}

# Parse "Month D, YYYY" (the comma is optional, trailing text is ignored).
# Arguments: $1 - the date string
# Sets:      the ITEM_* globals (ITEM_MONTH="--", ITEM_DAY="00" and an empty
#            ITEM_YEAR when the string does not match)
# Returns:   0 on success, 1 on bad usage or no match
_time_parse_monddyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in mon dd yyyy>" >&2
    return 1
  fi

  local dq='"'
  local text=${1//$dq/}
  local re='^[[:space:]]*([A-Za-z]+)[.]?[[:space:]]+([0-9]{1,2})(,[[:space:]]*|[[:space:]]+)([0-9]{4})'

  if [[ $text =~ $re ]]; then
    _time_store_item_date "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}" "${BASH_REMATCH[4]}"
  else
    _time_store_item_date "" "" ""
    return 1
  fi
}

# Parse "D Month YYYY" (a comma or period after the month is tolerated,
# trailing text is ignored).
# Arguments: $1 - the date string
# Sets:      the ITEM_* globals (ITEM_MONTH="--", ITEM_DAY="00" and an empty
#            ITEM_YEAR when the string does not match)
# Returns:   0 on success, 1 on bad usage or no match
_time_parse_ddmonyyyy() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <date in dd mon yyyy>" >&2
    return 1
  fi

  local dq='"'
  local text=${1//$dq/}
  local re='^[[:space:]]*([0-9]{1,2})[[:space:]]+([A-Za-z]+)[.,]?[[:space:]]+([0-9]{4})'

  if [[ $text =~ $re ]]; then
    _time_store_item_date "${BASH_REMATCH[2]}" "${BASH_REMATCH[1]}" "${BASH_REMATCH[3]}"
  else
    _time_store_item_date "" "" ""
    return 1
  fi
}
|
||||
104
functions/.functions.utils
Normal file
104
functions/.functions.utils
Normal file
@ -0,0 +1,104 @@
|
||||
# Produce a searchable copy of a PDF.
# Arguments: $1 - input PDF, $2 - output PDF
# If pdffonts reports any font on the first five pages the file is treated as
# already OCRed and copied unchanged. Otherwise every page is split out,
# rendered to a 300 dpi PNG, turned according to tesseract's "Rotate" hint,
# OCRed with ocrmypdf, and the per-page results are merged with pdfunite.
# Needs: pdffonts, pdfseparate, pdftoppm, pdfunite, qpdf, tesseract, convert,
#        ocrmypdf.
# Returns 0 on success, non-zero on bad usage or when a step fails.
_utils_ocrmypdf() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <in.pdf> <out.pdf>" >&2
    # return, not exit: this file is sourced, exit would kill the caller.
    return 1
  fi

  # Everything is local: the previous global "i" overwrote callers' counters.
  local src="$1"
  local dst="$2"
  local fonts work page img rotation
  local i=0 status=0
  local -a ocr_pages=()

  # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
  fonts=$(pdffonts -l 5 "$src" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
  if [ "$fonts" = '' ] || [ "$fonts" = '[none]' ]; then
    echo "NOT OCRed yet. Working..."
  else
    echo "$src is already OCRed. Saving as is."
    cp "$src" "$dst"
    return
  fi

  work=$(mktemp -d) || return 1

  if ! pdfseparate "$src" "$work/page-%04d.pdf"; then
    rm -rf -- "$work"
    return 1
  fi

  for page in "$work"/page-*.pdf; do
    img="$work/img-$i.png"
    # Reset any rotation flag stored in the page before rendering it.
    qpdf --replace-input --rotate=0:1-z "$page"
    pdftoppm -singlefile -r 300 -png -cropbox "$page" "$work/img-$i" || { status=1; break; }

    # Checks rotations. Annoying way to do it but whatever.
    rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
    case "$rotation" in
      180 | 90 | 270) convert "$img" -rotate "$rotation" "$img" ;;
    esac

    ocrmypdf \
      --skip-text \
      --clean \
      --optimize 1 \
      --jobs 1 \
      "$img" "$work/ocr-$i-tmp.pdf" || { status=1; break; }

    # NOTE(review): 90 and 270 are both followed by a 270 degree turn and 180
    # is not turned back at all - kept as found, confirm this is intended.
    case "$rotation" in
      90 | 270)
        qpdf "$work/ocr-$i-tmp.pdf" "$work/ocr-$i.pdf" --rotate=270:1-z &&
          rm -f "$work/ocr-$i-tmp.pdf"
        ;;
    esac
    # Still present when no turn was needed or qpdf failed: use it as is.
    if [ -e "$work/ocr-$i-tmp.pdf" ]; then
      mv "$work/ocr-$i-tmp.pdf" "$work/ocr-$i.pdf"
    fi

    ocr_pages+=("$work/ocr-$i.pdf")
    i=$((i + 1))
  done

  if ((status == 0)); then
    pdfunite "${ocr_pages[@]}" "$dst" || status=1
  fi

  rm -rf -- "$work"
  return "$status"
}
|
||||
|
||||
# Normalise a (possibly percent-encoded) string and print it on stdout.
# Arguments: $1 - the input string
# Steps, in order: repeated URI-unescaping until the text stops changing,
# Unicode NFKC normalisation, dash look-alikes -> "-", "&" -> "and", curly
# single quotes -> "'", curly double quotes -> '"', no-break space -> space,
# zero-width characters removed, whitespace collapsed and trimmed, and
# whitespace directly before a trailing ".ext" removed.
# Needs perl with the URI::Escape and Unicode::Normalize modules.
# Returns 1 (printing usage on stderr, nothing on stdout) without arguments.
_utils_fix_dashes() {
  if [ "$#" -eq 0 ]; then
    # stderr: stdout is the result channel and is usually captured by callers.
    echo "Usage: <input string>" >&2
    return 1
  fi

  perl -CSDA -MURI::Escape -MUnicode::Normalize -e '
binmode STDOUT, ":utf8";
my $s = shift // "";
my $prev;
do { $prev = $s; $s = uri_unescape($s); } while ($s ne $prev);
$s = NFKC($s);
$s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
$s =~ s/&/and/g;
$s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
$s =~ tr/\x{201C}\x{201D}/"/;
$s =~ tr/\x{00A0}/ /;
$s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
$s =~ s/\s+/ /g;
$s =~ s/^\s+|\s+$//g;
$s =~ s/\s+(\.[^. ]+)$/$1/;
print $s;
' "$1"
}
|
||||
|
||||
# Download a URL to a file, skipping the transfer when the server reports the
# existing local copy as current.
# Arguments: $1 - URL, $2 - output file
# Sets global WGET_UA (the browser user agent sent with the request).
# The body is first written to "<outfile>.part" and only moved into place
# after a complete HTTP 200 transfer, so a failed request can no longer
# overwrite or delete a previously archived copy.
# Outputs: progress on stdout, failures on stderr.
# Returns: 0 on HTTP 200 or 304, 1 otherwise (including bad usage).
_utils_download_helper() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <url> <outfile>" >&2
    # return, not exit: this file is sourced, exit would kill the scraper.
    return 1
  fi

  WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

  local url="$1"
  local out="$2"
  local part="${out}.part"
  local code
  local curl_status=0
  local -a time_cond=()

  # Only send a time condition when there is a local copy to compare with;
  # curl would otherwise try to read the file name as a date.
  if [[ -e "$out" ]]; then
    time_cond=(-z "$out")
  fi

  code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 "${time_cond[@]}" -o "$part" "$url") || curl_status=$?

  # A non-zero curl status (e.g. an interrupted transfer) is a failure even
  # if the response line said 200.
  if ((curl_status != 0)); then
    echo "FAILED! $code: $out | $url (curl exit $curl_status)" >&2
    rm -f -- "$part"
    return 1
  fi

  case "$code" in
    200)
      if ! mv -f -- "$part" "$out"; then
        echo "FAILED! $code: $out | $url" >&2
        rm -f -- "$part"
        return 1
      fi
      echo "Downloaded."
      ;;
    304)
      rm -f -- "$part"
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      rm -f -- "$part"
      return 1
      ;;
  esac
}
|
||||
83
template/default_getinvolved.html
Normal file
83
template/default_getinvolved.html
Normal file
@ -0,0 +1,83 @@
|
||||
<meta charset="UTF-8">
|
||||
<style>
  /* Print layout: keep common elements from being split across pages. */
  td,
  h1,
  h2,
  h3,
  p,
  b,
  div,
  i,
  span,
  label,
  ul,
  li,
  tr,
  table {
    page-break-inside: avoid;
  }

  /* Page frame: centred column with the base typography. */
  body {
    width: 90%;
    min-width: 600px;
    position: relative;
    margin-left: auto;
    margin-right: auto;
    color: #666;
    font-size: 16px;
    font-family: Frutiger,"Helvetica Neue",Helvetica,Arial,sans-serif;
    font-weight: 300;
  }

  strong {
    font-weight: 700;
  }

  p {
    color: #000000;
  }

  /* Headings and heading-styled utility classes. */
  .h1,
  .h2,
  .h3,
  .h4,
  .h5,
  .h6,
  .post-teaser.featured .post-title,
  h1,
  h2,
  h3,
  h4,
  h5,
  h6 {
    font-family: Gnuolane,"Helvetica Neue",Helvetica,Arial,sans-serif;
    font-weight: 700;
    line-height: 1.1;
    color: #087ac0;
  }

  /* Tables: full-width, collapsed borders, striped odd rows. */
  table {
    display: table;
    margin-bottom: 2em;
    min-width: 100%;
    border-spacing: 0;
    border-collapse: collapse;
    border-color: #ccc;
    background-color: transparent;
    line-height: 1.5;
  }

  .table-responsive {
    overflow-x: auto;
  }

  table tbody {
    display: table-row-group;
    vertical-align: top;
    border-color: inherit;
  }

  table tbody > tr:nth-of-type(2n+1) {
    background-color: #f2f2f2;
  }

  table tr {
    display: table-row;
    vertical-align: inherit;
    border-color: inherit;
  }

  table tbody > tr > td,
  table tbody > tr > th,
  table thead > tr > th {
    padding: 8px;
    border: 1px solid #ccc;
    vertical-align: top;
  }

  table td {
    display: table-cell;
    border: 1px solid #ccc;
  }

  /* Lists: spacing and box model (two adjacent identical-selector rules
     from the original, merged; declarations and order unchanged). */
  ol,
  ul {
    margin-top: 0;
    margin-bottom: 12px;
    box-sizing: border-box;
  }

  ol li {
    padding-left: 10px;
  }

  ol li,
  ul li {
    padding-bottom: 12px;
  }

  address,
  dd,
  dt,
  li,
  p {
    line-height: 1.5;
  }

  /* Images never overflow their container. */
  img {
    max-width: 100% !important;
    height: auto !important;
  }
</style>
|
||||
0
websites.csv
Normal file → Executable file
0
websites.csv
Normal file → Executable file
Loading…
Reference in New Issue
Block a user