Upload files to "/"
This commit is contained in:
parent
0793fe0036
commit
58c637e051
@ -17,8 +17,7 @@ YOU MUST HAVE `websites.csv` FOR ALL ESCRIBE SCRAPERS!
|
|||||||
|
|
||||||
## Scrape eScribe meetings (SCRAPE_MEET.SH)
|
## Scrape eScribe meetings (SCRAPE_MEET.SH)
|
||||||
|
|
||||||
This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST="TRUE"`, meetings older than 2 months will be downloaded. Otherwise, they will be skipped.
|
This bash script will scrape meetings from the eScribe meetings platform. There is a variable set called `SUPPORT_PAST`. If `SUPPORT_PAST=1` (true), meetings older than 2 months will be downloaded. Otherwise, they will be skipped.
|
||||||
Don't ask why "TRUE" is a string and not a boolean...
|
|
||||||
|
|
||||||
The basic structure of the output files is:
|
The basic structure of the output files is:
|
||||||
```
|
```
|
||||||
|
|||||||
@ -38,18 +38,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||||
|
|
||||||
INDEX_END="FALSE"
|
INDEX_END=0
|
||||||
while [[ $INDEX_END == "FALSE" ]]; do
|
while (( ! INDEX_END )); do
|
||||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||||
if [ $? -ne 8 ]; then
|
if [ $? -ne 8 ]; then
|
||||||
FOUNDLIST="FALSE"
|
FOUNDLIST=0
|
||||||
while IFS= read -r LINE; do
|
while IFS= read -r LINE; do
|
||||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
if (( FOUNDLIST )); then
|
||||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||||
if [[ "$GREPENDLIST" == "" ]]; then
|
if [[ "$GREPENDLIST" == "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: End of list."
|
echo "SCRAPE_ESCRIBE: End of list."
|
||||||
INDEX_END="TRUE"
|
INDEX_END=1
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||||
@ -88,11 +88,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||||
if [[ "$GREPLIST" != "" ]]; then
|
if [[ "$GREPLIST" != "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||||
FOUNDLIST="TRUE"
|
FOUNDLIST=1
|
||||||
fi
|
fi
|
||||||
done < $INDEX_PAGE
|
done < $INDEX_PAGE
|
||||||
else
|
else
|
||||||
INDEX_END="TRUE"
|
INDEX_END=1
|
||||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|||||||
@ -112,11 +112,11 @@ while IFS= read -r LINE_PRE; do
|
|||||||
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
|
GREPLINK=$(echo $LINE_HTML | grep "<a href" | sed 's/.*<a href=.\([^<]*\)">.*/\1/' | sed 's/".*//')
|
||||||
if [[ "$GREPARTICLESTART" != "" ]]; then
|
if [[ "$GREPARTICLESTART" != "" ]]; then
|
||||||
echo " FOUND INDEX ARTICLE START"
|
echo " FOUND INDEX ARTICLE START"
|
||||||
ISARTICLE="TRUE"
|
ISARTICLE=1
|
||||||
elif [[ "$GREPARTICLEEND" != "" ]]; then
|
elif [[ "$GREPARTICLEEND" != "" ]]; then
|
||||||
echo " END OF INDEX ARTICLE"
|
echo " END OF INDEX ARTICLE"
|
||||||
ISARTICLE=""
|
ISARTICLE=0
|
||||||
elif [[ "$GREPLINK" != "" ]] && [[ "$ISARTICLE" != "" ]]; then
|
elif [[ "$GREPLINK" != "" ]] && (( ISARTICLE )); then
|
||||||
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
|
mkdir "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments" 2> /dev/null
|
||||||
ISPDF=$(echo $GREPLINK | grep "\.pdf")
|
ISPDF=$(echo $GREPLINK | grep "\.pdf")
|
||||||
if [[ "$ISPDF" != "" ]]; then
|
if [[ "$ISPDF" != "" ]]; then
|
||||||
@ -141,14 +141,14 @@ while IFS= read -r LINE_PRE; do
|
|||||||
# CSS for the HTML is in the default template
|
# CSS for the HTML is in the default template
|
||||||
cat ./template/default.html > ./tmp/new.html
|
cat ./template/default.html > ./tmp/new.html
|
||||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||||
ISATTACHMENTARTICLE="TRUE"
|
ISATTACHMENTARTICLE=1
|
||||||
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
|
elif [[ "$GREPATTACHMENTARTICLEEND" != "" ]]; then
|
||||||
echo " END OF ATTACHMENT ARTICLE"
|
echo " END OF ATTACHMENT ARTICLE"
|
||||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||||
echo " PROCESSED TO PDF"
|
echo " PROCESSED TO PDF"
|
||||||
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
wkhtmltopdf ./tmp/new.html "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$ATTACHTITLE.pdf" 2> /dev/null
|
||||||
ISATTACHMENTARTICLE=""
|
ISATTACHMENTARTICLE=0
|
||||||
elif [[ "$GREPATTACHMENTLINK" != "" ]] && [[ "$ISATTACHMENTARTICLE" != "" ]]; then
|
elif [[ "$GREPATTACHMENTLINK" != "" ]] && (( ISATTACHMENTARTICLE )); then
|
||||||
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
|
ISREFPDF=$(echo $GREPATTACHMENTLINK | grep "\.pdf")
|
||||||
if [[ "$ISREFPDF" != "" ]]; then
|
if [[ "$ISREFPDF" != "" ]]; then
|
||||||
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
|
PDFREFNAME=$(echo $ISREFPDF | sed 's/.*\///')
|
||||||
@ -157,7 +157,7 @@ while IFS= read -r LINE_PRE; do
|
|||||||
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
|
wget --user-agent="$WGET_UA" "$ISREFPDF" -O "./LondonArchive/LTC/$COMMITTEENAME_SLUG/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/Attachments/$PDFREFNAME" -c -q #--show-progress
|
||||||
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
|
echo "<ul><li>$PDFREFNAME</li></ul>" >> ./tmp/new.html
|
||||||
fi
|
fi
|
||||||
elif [[ "$ISATTACHMENTARTICLE" != "" ]]; then
|
elif (( ISATTACHMENTARTICLE )); then
|
||||||
echo "$LINE_ATTACH" >> ./tmp/new.html
|
echo "$LINE_ATTACH" >> ./tmp/new.html
|
||||||
fi
|
fi
|
||||||
LINE_ATTACH=""
|
LINE_ATTACH=""
|
||||||
|
|||||||
@ -198,7 +198,7 @@ current_year=$(date +%Y)
|
|||||||
current_month=$(date +%m)
|
current_month=$(date +%m)
|
||||||
current_day=$(date +%d)00
|
current_day=$(date +%d)00
|
||||||
|
|
||||||
SUPPORT_PAST="TRUE"
|
SUPPORT_PAST=1
|
||||||
|
|
||||||
if [ -d "$TEMP_DIR" ]; then
|
if [ -d "$TEMP_DIR" ]; then
|
||||||
rm -r $TEMP_DIR
|
rm -r $TEMP_DIR
|
||||||
@ -214,18 +214,18 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CITY_ARCHIVE_NAME=$(echo $CITY_ARCHIVE_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||||
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
CALENDAR_NAME=$(echo $CALENDAR_NAME_PRE | sed 's/\"//g' | sed 's/\,//g')
|
||||||
|
|
||||||
INDEX_END="FALSE"
|
INDEX_END=0
|
||||||
while [[ $INDEX_END == "FALSE" ]]; do
|
while (( ! INDEX_END )); do
|
||||||
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
echo "SCRAPE_ESCRIBE: Downloading eScribe index..."
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
wget --no-check-certificate --user-agent="$WGET_UA" $INDEX_URL -O $INDEX_PAGE --show-progress
|
||||||
if [ $? -ne 8 ]; then
|
if [ $? -ne 8 ]; then
|
||||||
FOUNDLIST="FALSE"
|
FOUNDLIST=0
|
||||||
while IFS= read -r LINE; do
|
while IFS= read -r LINE; do
|
||||||
if [[ "TRUE" == $FOUNDLIST ]]; then
|
if (( FOUNDLIST )); then
|
||||||
GREPENDLIST=$(echo $LINE | grep '<option ')
|
GREPENDLIST=$(echo $LINE | grep '<option ')
|
||||||
if [[ "$GREPENDLIST" == "" ]]; then
|
if [[ "$GREPENDLIST" == "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: End of list."
|
echo "SCRAPE_ESCRIBE: End of list."
|
||||||
INDEX_END="TRUE"
|
INDEX_END=1
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
MEETING_NAME=$(echo $LINE | sed 's/.*<option[^>]*>\([^<]*\)<[\/:-]option>.*/\1/g')
|
||||||
@ -312,11 +312,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
else
|
else
|
||||||
echo "Dates are in the past!"
|
echo "Dates are in the past!"
|
||||||
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
echo "DATE : $MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY"
|
||||||
INPAST="TRUE"
|
INPAST=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
# I think "break" broke when I did nested loops. idk I'm too drunk for this.
|
||||||
if [[ "$INPAST" == "TRUE" ]] && [[ "$SUPPORT_PAST" != "TRUE" ]]; then
|
if (( INPAST )) && (( ! SUPPORT_PAST )); then
|
||||||
echo "Abort."
|
echo "Abort."
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
@ -325,8 +325,8 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
#cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].HasVideo'
|
||||||
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
VIDEOURL="$INDEX_URL$(cat "${TEMP_DIR}escribe_short.json" | jq --argjson i "$i" '.VideoLink.[].Url' | sed 's/\"//g')"
|
||||||
|
|
||||||
ERROR="FALSE"
|
ERROR=0
|
||||||
ADDENDUM_ERROR="FALSE"
|
ADDENDUM_ERROR=0
|
||||||
echo "Downloading agenda HTML..."
|
echo "Downloading agenda HTML..."
|
||||||
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
if [[ $AGENDA_REVISE_HTML_URL != "" ]]; then
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||||
@ -339,16 +339,16 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
elif [[ $AGENDA_COVER_HTML_URL != "" ]]; then
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
wget --no-check-certificate --user-agent="$WGET_UA" "$AGENDA_COVER_HTML_URL" -O $AGENDA_HTML -q #--show-progress
|
||||||
else
|
else
|
||||||
ERROR="TRUE"
|
ERROR=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
if [[ $ADDENDUM_HTML_URL != "" ]]; then
|
||||||
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
|
wget --no-check-certificate --user-agent="$WGET_UA" "$ADDENDUM_HTML_URL" -O $ADDENDUM_HTML -q #--show-progress
|
||||||
else
|
else
|
||||||
ADDENDUM_ERROR="TRUE"
|
ADDENDUM_ERROR=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$ERROR" == "FALSE" ]]; then
|
if (( ! ERROR )); then
|
||||||
|
|
||||||
mkdir "./$CITY_ARCHIVE_NAME"
|
mkdir "./$CITY_ARCHIVE_NAME"
|
||||||
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
mkdir "./$CITY_ARCHIVE_NAME/Meetings"
|
||||||
@ -376,7 +376,7 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||||
# Get attachment names
|
# Get attachment names
|
||||||
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
cat $AGENDA_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > "./tmp/attachment_names"
|
||||||
if [[ "$ADDENDUM_ERROR" == "FALSE" ]]; then
|
if (( ! ADDENDUM_ERROR )); then
|
||||||
# Get attachment links
|
# Get attachment links
|
||||||
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
cat $ADDENDUM_HTML | grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > "./tmp/attachment_urls"
|
||||||
# Get attachment names
|
# Get attachment names
|
||||||
@ -410,11 +410,11 @@ while read -r INDEX_URL_PRE CITY_ARCHIVE_NAME_PRE CALENDAR_NAME_PRE; do
|
|||||||
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
GREPLIST=$(echo $LINE | grep 'class="MeetingTypeListbox"')
|
||||||
if [[ "$GREPLIST" != "" ]]; then
|
if [[ "$GREPLIST" != "" ]]; then
|
||||||
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
echo "SCRAPE_ESCRIBE: Found meeting type list."
|
||||||
FOUNDLIST="TRUE"
|
FOUNDLIST=1
|
||||||
fi
|
fi
|
||||||
done < $INDEX_PAGE
|
done < $INDEX_PAGE
|
||||||
else
|
else
|
||||||
INDEX_END="TRUE"
|
INDEX_END=1
|
||||||
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
echo "SCRAPE_ESCRIBE: Couldn't save index!"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|||||||
@ -81,8 +81,8 @@ mkdir $TEMP_DIR
|
|||||||
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
|
SEARCH_URL="https://london.ca/business-development/planning-development-applications/planning-applications"
|
||||||
|
|
||||||
j=0
|
j=0
|
||||||
SEARCH_END="FALSE"
|
SEARCH_END=0
|
||||||
while [[ $SEARCH_END == "FALSE" ]]; do
|
while (( ! SEARCH_END )); do
|
||||||
echo "-========================================================================-"
|
echo "-========================================================================-"
|
||||||
echo "Downloading search results... Page $j"
|
echo "Downloading search results... Page $j"
|
||||||
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
wget --user-agent="$WGET_UA" $SEARCH_URL"?page=$j" -O $SEARCH_PAGE --timestamping -q #--show-progress
|
||||||
@ -124,8 +124,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
rm -f $PROJECT_IMAGE_NAMES
|
rm -f $PROJECT_IMAGE_NAMES
|
||||||
|
|
||||||
while IFS= read -r PLINE; do
|
while IFS= read -r PLINE; do
|
||||||
if [[ "$NEXT_LINE_FITEM" == "TRUE" ]]; then
|
if (( NEXT_LINE_FITEM )); then
|
||||||
NEXT_LINE_FITEM="FALSE"
|
NEXT_LINE_FITEM=0
|
||||||
|
|
||||||
# Is this line an actual item?
|
# Is this line an actual item?
|
||||||
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
|
PROJECT_INFO_IS_ITEMS=$(echo $PLINE | grep "field__items")
|
||||||
@ -215,8 +215,8 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$NEXT_LINE_IMAGE" == "TRUE" ]]; then
|
if (( NEXT_LINE_IMAGE )); then
|
||||||
NEXT_LINE_IMAGE="FALSE"
|
NEXT_LINE_IMAGE=0
|
||||||
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
|
PROJECT_IMAGE_URL=$(echo $PLINE | sed 's/.*<img[^>]*src="\([^"]*\)".*/\1/p' | sed 's/?.*//' | uniq)
|
||||||
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
PROJECT_IMAGE_URL_SHORT=$(echo $PLINE | grep "https://london.ca")
|
||||||
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
|
if [[ $PROJECT_IMAGE_URL_SHORT == "" ]];then
|
||||||
@ -243,14 +243,14 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
|
PROJECT_FOUND_FLABEL=$(echo $PLINE | grep "field__label")
|
||||||
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
|
if [[ "$PROJECT_FOUND_FLABEL" != "" ]]; then
|
||||||
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
|
PROJECT_INFO_LABEL=$(echo $PLINE | sed 's/.*<div class="field__label">\(<time[^>]*>\)\?\([^<]*\).*/\2/p' | uniq)
|
||||||
NEXT_LINE_FITEM="TRUE"
|
NEXT_LINE_FITEM=1
|
||||||
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
|
# Info boxes will always have a label on one line, then the contents in the next. (except for contact info lol)
|
||||||
# We're setting a flag to let the script know if an upcoming line is contents.
|
# We're setting a flag to let the script know if an upcoming line is contents.
|
||||||
fi
|
fi
|
||||||
|
|
||||||
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
|
PROJECT_FOUND_IMAGE=$(echo $PLINE | grep "field__label visually-hidden" | grep "Image")
|
||||||
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
|
if [[ "$PROJECT_FOUND_IMAGE" != "" ]]; then
|
||||||
NEXT_LINE_IMAGE="TRUE"
|
NEXT_LINE_IMAGE=1
|
||||||
# Same idea as before but for the image shown on the main page.
|
# Same idea as before but for the image shown on the main page.
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -340,11 +340,11 @@ while [[ $SEARCH_END == "FALSE" ]]; do
|
|||||||
fi
|
fi
|
||||||
done < $SEARCH_PAGE
|
done < $SEARCH_PAGE
|
||||||
else
|
else
|
||||||
SEARCH_END="TRUE"
|
SEARCH_END=1
|
||||||
echo "No more pages!"
|
echo "No more pages!"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
SEARCH_END="TRUE"
|
SEARCH_END=1
|
||||||
echo "No more pages!"
|
echo "No more pages!"
|
||||||
fi
|
fi
|
||||||
((j++))
|
((j++))
|
||||||
|
|||||||
35
websites.csv
35
websites.csv
@ -1,34 +1,3 @@
|
|||||||
"https://pub-brampton.escribemeetings.com/", "SubBramptonArchive", ""
|
"https://pub-london.escribemeetings.com/", "LondonArchive", ""
|
||||||
"https://pub-markham.escribemeetings.com/", "SubMarkhamArchive", ""
|
"https://pub-stthomas.escribemeetings.com/", "StThomasArchive", ""
|
||||||
"https://pub-cityofkingston.escribemeetings.com/", "SubKingstonArchive", ""
|
|
||||||
"https://pub-barrie.escribemeetings.com/", "SubBarrieArchive", ""
|
|
||||||
"https://pub-oshawa.escribemeetings.com/", "SubOshawaArchive", ""
|
|
||||||
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
|
"https://pub-ottawa.escribemeetings.com/", "OttawaArchive", ""
|
||||||
"https://pub-owensound.escribemeetings.com/", "SubOwenSoundArchive", ""
|
|
||||||
"https://pub-goderich.escribemeetings.com/", "SubGoderichArchive", ""
|
|
||||||
"https://pub-oakville.escribemeetings.com/", "SubOakvilleArchive", ""
|
|
||||||
"https://burlingtonpublishing.escribemeetings.com/", "SubBurlingtonArchive", ""
|
|
||||||
"https://pub-milton.escribemeetings.com/", "SubMiltonArchive", ""
|
|
||||||
"https://pub-durhamregion.escribemeetings.com/", "SubDurhamArchive", ""
|
|
||||||
"https://pub-richmondhill.escribemeetings.com/", "SubRichmondHillArchive", ""
|
|
||||||
"https://pub-whitby.escribemeetings.com/", "SubWhitbyArchive", ""
|
|
||||||
"https://pub-london.escribemeetings.com/", "LondonArchive", "London Meetings"
|
|
||||||
"https://pub-middlesexcounty.escribemeetings.com/", "SubMiddlesexCountyArchive", ""
|
|
||||||
"https://pub-lucanbiddulph.escribemeetings.com/", "SubLucanBiddulphArchive", ""
|
|
||||||
"https://pub-thamescentre.escribemeetings.com/", "SubThamesCentreArchive", ""
|
|
||||||
"https://pub-stthomas.escribemeetings.com/", "SubStThomasArchive", ""
|
|
||||||
"https://pub-northmiddlesex.escribemeetings.com/", "SubNorthMiddlesexArchive", ""
|
|
||||||
"https://pub-strathroy-caradoc.escribemeetings.com/", "SubStrathroyCaradocArchive", ""
|
|
||||||
"https://pub-adelaidemetcalfe.escribemeetings.com/", "SubAdelaideMetcalfeArchive", ""
|
|
||||||
"https://pub-middlesexcentre.escribemeetings.com/", "SubMiddsexCentreArchive", ""
|
|
||||||
"https://pub-mississauga.escribemeetings.com/", "SubMississaugaArchive", ""
|
|
||||||
"https://pub-guelph.escribemeetings.com/", "SubGuelphArchive", ""
|
|
||||||
"https://pub-regionofwaterloo.escribemeetings.com/", "SubWaterlooArchive", ""
|
|
||||||
"https://pub-kitchener.escribemeetings.com/", "SubKitchenerArchive", ""
|
|
||||||
"https://pub-hamilton.escribemeetings.com/", "SubHamiltonArchive", ""
|
|
||||||
"https://pub-brantford.escribemeetings.com/", "SubBrantfordArchive", ""
|
|
||||||
"https://pub-woodstock.escribemeetings.com/", "SubWoodstockArchive", ""
|
|
||||||
"https://pub-stratford.escribemeetings.com/", "SubStratfordArchive", ""
|
|
||||||
"https://pub-chatham-kent.escribemeetings.com/", "SubChathamKentArchive", ""
|
|
||||||
"https://pub-cambridge.escribemeetings.com/", "SubCambridgeArchive", ""
|
|
||||||
"https://pub-vaughan.escribemeetings.com/", "SubVaughanArchive", ""
|
|
||||||
|
|||||||
|
Loading…
Reference in New Issue
Block a user