Geocode addresses with Google Geocoding API

And cache the results.
This commit is contained in:
ezwelty 2024-02-21 16:13:23 +01:00 committed by Ethan Welty
parent ddd75864e4
commit 3105ecfbc2
6 changed files with 150 additions and 9 deletions

4
.env.example Normal file
View File

@ -0,0 +1,4 @@
GOOGLE_MAPS_API_KEY=
DATA_ARCHIVE=archive
DATA_ARCHIVE_LOG=archive.jsonl
GEOCODE_ARCHIVE=geocode

2
.gitignore vendored
View File

@ -16,3 +16,5 @@
/taxa*/
archive
archive.jsonl
geocode
.env

View File

@ -1,3 +1,4 @@
require('dotenv').config()
const fs = require('fs')
const mime = require('mime-types')
const puppeteer = require('puppeteer')
@ -9,10 +10,6 @@ const streamPipeline = util.promisify(stream.pipeline)
const helpers = require('./helpers')
// NOTE: Requires execution from project root
ARCHIVE_PATH = 'archive'
LOG_PATH = 'archive.jsonl'
/**
* Download file and compute MD5 hash of the stream.
*
@ -147,7 +144,7 @@ async function hashFile(file, options = {encoding: 'base64'}) {
function buildPath({url, checksum, date = new Date()} = {}) {
const hash = url ? md5(url) : checksum
date = date.toISOString().replace(/:/g, '')
return path.join(ARCHIVE_PATH, hash, date)
return path.join(process.env.DATA_ARCHIVE, hash, date)
}
/**
@ -172,7 +169,7 @@ async function log({ date = new Date(), ...props } = {}) {
entry.path = existingEntry.path
}
}
fs.appendFileSync(LOG_PATH, JSON.stringify(entry) + '\n')
fs.appendFileSync(process.env.DATA_ARCHIVE_LOG, JSON.stringify(entry) + '\n')
return entry
}
@ -258,7 +255,7 @@ async function search(params, {limit, maxDays} = {}) {
}
const criterias = Object.entries(params || {})
const entries = []
for await (const log of helpers.iterateJSONL(LOG_PATH)) {
for await (const log of helpers.iterateJSONL(process.env.DATA_ARCHIVE_LOG)) {
if (limit && entries.length === limit) {
return entries
}
@ -274,8 +271,6 @@ async function search(params, {limit, maxDays} = {}) {
}
module.exports = {
ARCHIVE_PATH,
LOG_PATH,
md5,
loadPage,
readPageHtml,

59
lib/geocode.js Normal file
View File

@ -0,0 +1,59 @@
require('dotenv').config()
const fs = require('fs')
const path = require('path')
const {Client} = require('@googlemaps/google-maps-services-js')
const client = new Client()
const archive = require('./archive')
/**
 * Geocode address.
 *
 * Sends the address to `client.geocode` with the API key read from
 * `process.env.GOOGLE_MAPS_API_KEY` and a 1000 ms timeout.
 *
 * @param {string} address - Address to geocode.
 * @returns {Promise<object>} Geocode results (`response.data.results`).
 * @throws {Error} If the request fails. The message is the `error_message`
 * of the response body when the failure carries one, otherwise the message of
 * the underlying error. The underlying error is attached as `cause`.
 */
async function geocode(address) {
  let response
  try {
    response = await client.geocode({
      params: {address, key: process.env.GOOGLE_MAPS_API_KEY},
      timeout: 1000, // milliseconds
    })
  } catch (error) {
    // Failures without a response (e.g. a timeout) have no `error.response`,
    // so reading `error.response.data` unguarded would raise a TypeError that
    // hides the real failure. Fall back to the underlying message instead.
    const message =
      error?.response?.data?.error_message ?? error?.message ?? String(error)
    const wrapped = new Error(message)
    // Keep the original error (status code, stack, ...) reachable for callers.
    wrapped.cause = error
    throw wrapped
  }
  return response.data.results
}
/**
 * Build the cache file path for an address.
 *
 * The file is named after the value returned by `archive.md5` for the address
 * and placed under the directory given by `process.env.GEOCODE_ARCHIVE`.
 *
 * @param {string} address - Address to geocode.
 * @returns {string} Path of the form `<GEOCODE_ARCHIVE>/<md5>.json`.
 */
function buildGeocodePath(address) {
  const filename = `${archive.md5(address)}.json`
  const directory = process.env.GEOCODE_ARCHIVE
  return `${directory}/${filename}`
}
/**
 * Geocode address with caching.
 *
 * Results are stored as JSON at the path returned by `buildGeocodePath`.
 * If that file exists, it is read and returned without calling `geocode`.
 * Otherwise the address is geocoded and the result is written to the file,
 * creating the parent directory first if needed.
 *
 * @param {string} address - Address to geocode.
 * @returns {Promise<object>} Cache entry `{address, date, result}`. Note that
 * `date` is a `Date` on a fresh lookup but a string when read back from the
 * cache file, since it is not revived after the JSON round trip.
 * @throws {Error} If the cached entry was stored for a different address.
 */
async function geocodeCached(address) {
  const file = buildGeocodePath(address)
  if (fs.existsSync(file)) {
    const cached = JSON.parse(fs.readFileSync(file, 'utf8'))
    // Guard against two addresses mapping to the same cache file.
    if (cached.address !== address) {
      throw new Error(
        `Address mismatch in ${file}: ${cached.address} !== ${address}`
      )
    }
    return cached
  }
  const result = await geocode(address)
  const data = {address, date: new Date(), result}
  // The cache directory may not exist yet. Create it so the result of the
  // request just made is not lost to an ENOENT on write.
  fs.mkdirSync(path.dirname(file), {recursive: true})
  fs.writeFileSync(file, JSON.stringify(data))
  return data
}
module.exports = {
geocode,
geocodeCached,
}

View File

@ -22,6 +22,7 @@
"@esri/arcgis-rest-form-data": "^4.0.0",
"@esri/arcgis-rest-portal": "^4.0.3",
"@esri/arcgis-rest-request": "^4.0.3",
"@googlemaps/google-maps-services-js": "^3.3.42",
"adm-zip": "^0.5.10",
"axios": "^1.4.0",
"colors": "^1.1.2",
@ -29,6 +30,7 @@
"command-line-usage": "^6.1.0",
"csv-parser": "^2.3.2",
"decompress": "^4.2.1",
"dotenv": "^16.3.1",
"fuzzyset.js": "^1.0.5",
"gdal-async": "^3.7.0",
"glob": "^7.1.2",

View File

@ -337,6 +337,24 @@
resolved "https://registry.yarnpkg.com/@gar/promisify/-/promisify-1.1.3.tgz#555193ab2e3bb3b6adc3d551c9c030d9e860daf6"
integrity sha512-k2Ty1JcVojjJFwrg/ThKi2ujJ7XNLYaFGNB/bWT9wGR+oSMJHMa5w+CUq6p/pVrKeNNgA7pCqEcjSnHVoqJQFw==
"@googlemaps/google-maps-services-js@^3.3.42":
version "3.3.42"
resolved "https://registry.yarnpkg.com/@googlemaps/google-maps-services-js/-/google-maps-services-js-3.3.42.tgz#61b3ba9045c84a29c1ffeca1d571ce56b316eca3"
integrity sha512-DfqM28z0jSMr0BSw+CUcUPJLwwMhMf1f+IWfFYuPs6o/AqyYUN+jLjBQKfaUh69e8MShYM4LzcNBYjyttYtsmA==
dependencies:
"@googlemaps/url-signature" "^1.0.4"
agentkeepalive "^4.1.0"
axios "^1.5.1"
query-string "<8.x"
retry-axios "<3.x"
"@googlemaps/url-signature@^1.0.4":
version "1.0.30"
resolved "https://registry.yarnpkg.com/@googlemaps/url-signature/-/url-signature-1.0.30.tgz#6f82bd504c39a691628bfa66fc568e6937af29ed"
integrity sha512-iT2Ju1t+DiAloAZa3wsRPRNIxxtCPN0v9iRCzlMM/7U3NQlmiIcTWoY6+OY3ZFZUYE1a7Z0kC6AmtbYVtSILCQ==
dependencies:
crypto-js "^4.2.0"
"@hapi/address@^2.1.2":
version "2.1.4"
resolved "https://registry.npmjs.org/@hapi/address/-/address-2.1.4.tgz"
@ -837,6 +855,13 @@ agent-base@6, agent-base@^6.0.2:
dependencies:
debug "4"
agentkeepalive@^4.1.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/agentkeepalive/-/agentkeepalive-4.5.0.tgz#2673ad1389b3c418c5a20c5d7364f93ca04be923"
integrity sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==
dependencies:
humanize-ms "^1.2.1"
agentkeepalive@^4.1.3:
version "4.3.0"
resolved "https://registry.yarnpkg.com/agentkeepalive/-/agentkeepalive-4.3.0.tgz#bb999ff07412653c1803b3ced35e50729830a255"
@ -991,6 +1016,15 @@ axios@^1.4.0:
form-data "^4.0.0"
proxy-from-env "^1.1.0"
axios@^1.5.1:
version "1.6.2"
resolved "https://registry.yarnpkg.com/axios/-/axios-1.6.2.tgz#de67d42c755b571d3e698df1b6504cde9b0ee9f2"
integrity sha512-7i24Ri4pmDRfJTR7LDBhsOTtcm+9kjX5WiY1X3wIisx6G9So3pfMkEiU7emUBe46oceVImccTEM3k6C5dbVW8A==
dependencies:
follow-redirects "^1.15.0"
form-data "^4.0.0"
proxy-from-env "^1.1.0"
babel-jest@^28.1.3:
version "28.1.3"
resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-28.1.3.tgz#c1187258197c099072156a0a121c11ee1e3917d5"
@ -1486,6 +1520,11 @@ cross-spawn@^7.0.3:
shebang-command "^2.0.0"
which "^2.0.1"
crypto-js@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/crypto-js/-/crypto-js-4.2.0.tgz#4d931639ecdfd12ff80e8186dba6af2c2e856631"
integrity sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q==
csv-parser@^2.3.2:
version "2.3.2"
resolved "https://registry.npmjs.org/csv-parser/-/csv-parser-2.3.2.tgz"
@ -1510,6 +1549,11 @@ debug@4, debug@4.3.4, debug@^4.1.0, debug@^4.1.1, debug@^4.3.3:
dependencies:
ms "2.1.2"
decode-uri-component@^0.2.2:
version "0.2.2"
resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.2.tgz#e69dbe25d37941171dd540e024c444cd5188e1e9"
integrity sha512-FqUYQ+8o158GyGTrMFJms9qh3CqTKvAqgqsTnkLI8sKu0028orqBhxNMFkFen0zGyg6epACD32pjVk58ngIErQ==
decompress-tar@^4.0.0, decompress-tar@^4.1.0, decompress-tar@^4.1.1:
version "4.1.1"
resolved "https://registry.npmjs.org/decompress-tar/-/decompress-tar-4.1.1.tgz"
@ -1631,6 +1675,11 @@ dmd@^4.0.5:
test-value "^3.0.0"
walk-back "^4.0.0"
dotenv@^16.3.1:
version "16.3.1"
resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-16.3.1.tgz#369034de7d7e5b120972693352a3bf112172cc3e"
integrity sha512-IPzF4w4/Rd94bA9imS68tZBaYyBWSCE47V1RGuMrB94iyTOIEwRmVL2x/4An+6mETpLrKJ5hQkB8W4kFAadeIQ==
electron-to-chromium@^1.4.202:
version "1.4.226"
resolved "https://registry.yarnpkg.com/electron-to-chromium/-/electron-to-chromium-1.4.226.tgz#837ea1d19b8305a913cd5f31d135681c4b6d63b1"
@ -1819,6 +1868,11 @@ fill-range@^7.0.1:
dependencies:
to-regex-range "^5.0.1"
filter-obj@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/filter-obj/-/filter-obj-1.1.0.tgz#9b311112bc6c6127a16e016c6c5d7f19e0805c5b"
integrity sha512-8rXg1ZnX7xzy2NGDVkBVaAy+lSlPNwad13BtgSlLuxfIslyt5Vg64U7tFcCt4WS1R0hvtnQybT/IyCkGZ3DpXQ==
find-replace@^3.0.0:
version "3.0.0"
resolved "https://registry.npmjs.org/find-replace/-/find-replace-3.0.0.tgz"
@ -3428,6 +3482,16 @@ puppeteer@^19.8.5:
proxy-from-env "1.1.0"
puppeteer-core "19.8.5"
query-string@<8.x:
version "7.1.3"
resolved "https://registry.yarnpkg.com/query-string/-/query-string-7.1.3.tgz#a1cf90e994abb113a325804a972d98276fe02328"
integrity sha512-hh2WYhq4fi8+b+/2Kg9CEge4fDPvHS534aOOvOZeQ3+Vf2mCFsaFBYj0i+iXcAq6I9Vzp5fjMFBlONvayDC1qg==
dependencies:
decode-uri-component "^0.2.2"
filter-obj "^1.1.0"
split-on-first "^1.0.0"
strict-uri-encode "^2.0.0"
react-is@^18.0.0:
version "18.2.0"
resolved "https://registry.yarnpkg.com/react-is/-/react-is-18.2.0.tgz#199431eeaaa2e09f86427efbb4f1473edb47609b"
@ -3532,6 +3596,11 @@ resolve@^1.20.0:
path-parse "^1.0.7"
supports-preserve-symlinks-flag "^1.0.0"
retry-axios@<3.x:
version "2.6.0"
resolved "https://registry.yarnpkg.com/retry-axios/-/retry-axios-2.6.0.tgz#d4dc5c8a8e73982e26a705e46a33df99a28723e0"
integrity sha512-pOLi+Gdll3JekwuFjXO3fTq+L9lzMQGcSq7M5gIjExcl3Gu1hd4XXuf5o3+LuSBsaULQH7DiNbsqPd1chVpQGQ==
retry@^0.12.0:
version "0.12.0"
resolved "https://registry.yarnpkg.com/retry/-/retry-0.12.0.tgz#1b42a6266a21f07421d1b0b54b7dc167b01c013b"
@ -3673,6 +3742,11 @@ source-map@^0.6.0, source-map@^0.6.1:
resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.6.1.tgz#74722af32e9614e9c287a8d0bbde48b5e2f1a263"
integrity sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==
split-on-first@^1.0.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f"
integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw==
split2@^2.1.0:
version "2.2.0"
resolved "https://registry.npmjs.org/split2/-/split2-2.2.0.tgz"
@ -3718,6 +3792,11 @@ stream-via@^1.0.4:
resolved "https://registry.npmjs.org/stream-via/-/stream-via-1.0.4.tgz"
integrity sha512-DBp0lSvX5G9KGRDTkR/R+a29H+Wk2xItOF+MpZLLNDWbEV9tGPnqLPxHEYjmiz8xGtJHRIqmI+hCjmNzqoA4nQ==
strict-uri-encode@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/strict-uri-encode/-/strict-uri-encode-2.0.0.tgz#b9c7330c7042862f6b142dc274bbcc5866ce3546"
integrity sha512-QwiXZgpRcKkhTj2Scnn++4PKtWsH0kpzZ62L2R6c/LUVYv7hVnZqcg2+sMuT6R7Jusu1vviK/MFsu6kNJfWlEQ==
string-length@^4.0.1:
version "4.0.2"
resolved "https://registry.yarnpkg.com/string-length/-/string-length-4.0.2.tgz#a8a8dc7bd5c1a82b9b3c8b87e125f66871b6e57a"