Update readme and cli

This commit is contained in:
ezwelty 2024-09-24 23:02:32 +02:00 committed by Ethan Welty
parent dc728e28a0
commit 0234726c85
14 changed files with 535 additions and 521 deletions

1
.gitignore vendored
View File

@ -4,3 +4,4 @@
/node_modules/
archive
.env
output

318
API.md
View File

@ -1,9 +1,6 @@
## Modules
<dl>
<dt><a href="#module_load">load</a></dt>
<dd><p>Load the provided source datasets.</p>
</dd>
<dt><a href="#module_names">names</a></dt>
<dd><p>Parse scientific names.</p>
</dd>
@ -11,7 +8,7 @@
<dd><p>Describe a source dataset.</p>
</dd>
<dt><a href="#module_sourceio">sourceio</a></dt>
<dd><p>Read and write source properties.</p>
<dd><p>Read and write sources and source properties.</p>
</dd>
<dt><a href="#module_taxamatch">taxamatch</a></dt>
<dd><p>Match scientific names.</p>
@ -80,30 +77,6 @@ binary files and same as UTF-8 for text.</p>
<dt><a href="#search">search(params, options)</a><code>Array.&lt;ArchiveEntry&gt;</code></dt>
<dd><p>Search log for matching entries.</p>
</dd>
<dt><a href="#geocode">geocode(address)</a><code>Promise.&lt;object&gt;</code></dt>
<dd><p>Geocode address.</p>
</dd>
<dt><a href="#geocodeCached">geocodeCached(address)</a><code>Promise.&lt;object&gt;</code></dt>
<dd><p>Geocode address with caching.</p>
</dd>
<dt><a href="#buildMapFromCrosswalks">buildMapFromCrosswalks(crosswalks)</a><code>Array.&lt;Object.&lt;string, Array.&lt;string&gt;&gt;&gt;</code></dt>
<dd><p>Build source-target field name map from crosswalks.</p>
</dd>
<dt><a href="#matchFieldName">matchFieldName(name, map)</a><code>Array.&lt;Object&gt;</code></dt>
<dd><p>Find potential target field names matching a source field name.</p>
</dd>
<dt><a href="#buildGetCapabilitiesUrl">buildGetCapabilitiesUrl(url)</a><code>string</code></dt>
<dd><p>Build WFS GetCapabilities URL.</p>
</dd>
<dt><a href="#parseCapabilities">parseCapabilities(xml)</a><code>object</code></dt>
<dd><p>Parse WFS GetCapabilities response.</p>
</dd>
<dt><a href="#chooseOutputFormat">chooseOutputFormat(formats)</a><code>string</code> | <code>null</code></dt>
<dd><p>Choose the output format.</p>
</dd>
<dt><a href="#buildGetFeatureUrl">buildGetFeatureUrl(url, capabilities, paging)</a><code>string</code></dt>
<dd><p>Build WFS GetFeature URL.</p>
</dd>
<dt><a href="#getBrowser">getBrowser()</a><code>Promise.&lt;puppeteer.Browser&gt;</code></dt>
<dd><p>Get cached browser instance.</p>
</dd>
@ -123,34 +96,26 @@ binary files and same as UTF-8 for text.</p>
<dd><p>Download web page as MHTML and log result.</p>
<p>Page is rendered in a headless browser (puppeteer) and saved as MHTML.</p>
</dd>
<dt><a href="#geocode">geocode(address)</a><code>Promise.&lt;object&gt;</code></dt>
<dd><p>Geocode address.</p>
</dd>
<dt><a href="#geocodeCached">geocodeCached(address)</a><code>Promise.&lt;object&gt;</code></dt>
<dd><p>Geocode address with caching.</p>
</dd>
<dt><a href="#buildGetCapabilitiesUrl">buildGetCapabilitiesUrl(url)</a><code>string</code></dt>
<dd><p>Build WFS GetCapabilities URL.</p>
</dd>
<dt><a href="#parseCapabilities">parseCapabilities(xml)</a><code>object</code></dt>
<dd><p>Parse WFS GetCapabilities response.</p>
</dd>
<dt><a href="#chooseOutputFormat">chooseOutputFormat(formats)</a><code>string</code> | <code>null</code></dt>
<dd><p>Choose the output format.</p>
</dd>
<dt><a href="#buildGetFeatureUrl">buildGetFeatureUrl(url, capabilities, paging)</a><code>string</code></dt>
<dd><p>Build WFS GetFeature URL.</p>
</dd>
</dl>
<a name="module_load"></a>
## load
Load the provided source datasets.
* * *
<a name="module_load..loadSources"></a>
### load~loadSources(path, [filters], [dir]) ⇒ <code>Array.&lt;Source&gt;</code>
Load sources from source properties.
**Kind**: inner method of [<code>load</code>](#module_load)
| Param | Type | Default | Description |
| --- | --- | --- | --- |
| path | <code>string</code> | | Directory of JS files containing source properties. |
| [filters] | <code>object</code> | <code>{}</code> | |
| filters.ids | <code>Array.&lt;string&gt;</code> | | Return only sources with these identifiers. |
| filters.countries | <code>Array.&lt;string&gt;</code> | | Return only source with these countries. |
| [dir] | <code>string</code> | <code>&quot;data/${id}/input&quot;</code> | Source input directory (template interpolated on source properties). |
* * *
<a name="module_names"></a>
## names
@ -1057,12 +1022,13 @@ Throw or print error to console (red).
<a name="module_sourceio"></a>
## sourceio
Read and write source properties.
Read and write sources and source properties.
* [sourceio](#module_sourceio)
* [~readSourceProperties(file)](#module_sourceio..readSourceProperties) ⇒ <code>Array.&lt;SourceProperties&gt;</code>
* [~writeSourceProperties(sources, file, currentFile)](#module_sourceio..writeSourceProperties)
* [~writeSourceProperties(sourceProps, file, currentFile)](#module_sourceio..writeSourceProperties)
* [~loadSources(file, [filters])](#module_sourceio..loadSources) ⇒ <code>Array.&lt;Source&gt;</code>
* * *
@ -1084,18 +1050,42 @@ Read source properties from a file.
<a name="module_sourceio..writeSourceProperties"></a>
### sourceio~writeSourceProperties(sources, file, currentFile)
### sourceio~writeSourceProperties(sourceProps, file, currentFile)
Write source properties to a file.
**Kind**: inner method of [<code>sourceio</code>](#module_sourceio)
| Param | Type | Description |
| --- | --- | --- |
| sources | <code>Array.&lt;SourceProperties&gt;</code> | Source properties. |
| sourceProps | <code>Array.&lt;SourceProperties&gt;</code> | Source properties. |
| file | <code>string</code> | Path to new source properties file. |
| currentFile | <code>string</code> | Path to current source properties file (defaults to `file`). Used to replicate the header (everything before `module.exports`). |
* * *
<a name="module_sourceio..loadSources"></a>
### sourceio~loadSources(file, [filters]) ⇒ <code>Array.&lt;Source&gt;</code>
Load sources from source properties.
Crosswalks are modified for unit conversions and range parsing.
**Kind**: inner method of [<code>sourceio</code>](#module_sourceio)
| Param | Type | Default | Description |
| --- | --- | --- | --- |
| file | <code>string</code> | | Path to file containing source properties. |
| [filters] | <code>object</code> | <code>{}</code> | |
| filters.id | <code>Array.&lt;string&gt;</code> | | Filter by id. |
| filters.country | <code>Array.&lt;string&gt;</code> | | Filter by country. |
| filters.state | <code>Array.&lt;string&gt;</code> | | Filter by state. |
| filters.city | <code>Array.&lt;string&gt;</code> | | Filter by city. |
| filters.designation | <code>Array.&lt;string&gt;</code> | | Filter by designation. |
| filters.scope | <code>Array.&lt;string&gt;</code> | | Filter by scope. |
| filters.omit | <code>boolean</code> | | Whether to include sources flagged as `omit: true`. |
* * *
<a name="module_taxamatch"></a>
@ -1373,6 +1363,7 @@ Additional properties not used by [Source](Source) but used elsewhere.
| state | <code>string</code> | Local name of first-level administrative division (see https://en.wikipedia.org/wiki/List_of_administrative_divisions_by_country) with the exception of: - Ireland: NUTS 3 Region (https://en.wikipedia.org/wiki/NUTS_statistical_regions_of_Ireland) - Japan: Region (https://en.wikipedia.org/wiki/List_of_regions_of_Japan) - Netherlands: Province (https://en.wikipedia.org/wiki/Provinces_of_the_Netherlands) - New Zealand: Region (https://en.wikipedia.org/wiki/Regions_of_New_Zealand) - United Kingdom (England): Region (https://en.wikipedia.org/wiki/Regions_of_England) - United Kingdom (other): Country |
| city | <code>string</code> | Local name of city or municipality. |
| designation | <code>string</code> | Local name of `city` subset, administrative unit, university, or other institution if not `country`, `state`, or `city`. |
| scope | <code>string</code> | Scope or type of the inventory (e.g. `tree`, `tree-street`, `tree-street-main`, `tree-park`, `tree-notable`). |
| language | <code>string</code> | Language of contents as an [ISO 639-1](https://en.wikipedia.org/wiki/ISO_639-1) code (e.g. `en`) and an optional [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) region code (e.g. `en-AU`). |
| primary | <code>string</code> | `id` of the primary source (for grouping sources together). |
| long | <code>string</code> | Full name of the government body, university, or other institution (e.g. `City of Melbourne`). |
@ -1668,127 +1659,6 @@ descending.
| [options.maxDays] | <code>int</code> | Maximum age of result in days |
* * *
<a name="geocode"></a>
## geocode(address) ⇒ <code>Promise.&lt;object&gt;</code>
Geocode address.
**Kind**: global function
**Returns**: <code>Promise.&lt;object&gt;</code> - Geocode results.
| Param | Type | Description |
| --- | --- | --- |
| address | <code>string</code> | Address to geocode. |
* * *
<a name="geocodeCached"></a>
## geocodeCached(address) ⇒ <code>Promise.&lt;object&gt;</code>
Geocode address with caching.
**Kind**: global function
**Returns**: <code>Promise.&lt;object&gt;</code> - Geocode results.
| Param | Type | Description |
| --- | --- | --- |
| address | <code>string</code> | Address to geocode. |
* * *
<a name="buildMapFromCrosswalks"></a>
## buildMapFromCrosswalks(crosswalks) ⇒ <code>Array.&lt;Object.&lt;string, Array.&lt;string&gt;&gt;&gt;</code>
Build source-target field name map from crosswalks.
**Kind**: global function
**Returns**: <code>Array.&lt;Object.&lt;string, Array.&lt;string&gt;&gt;&gt;</code> - Lowercased source field names
mapped to each target field name.
| Param | Type | Description |
| --- | --- | --- |
| crosswalks | <code>Array.&lt;Object.&lt;string, (string\|function())&gt;&gt;</code> | Source crosswalks. |
* * *
<a name="matchFieldName"></a>
## matchFieldName(name, map) ⇒ <code>Array.&lt;Object&gt;</code>
Find potential target field names matching a source field name.
**Kind**: global function
| Param | Type | Description |
| --- | --- | --- |
| name | <code>string</code> | Source field name. |
| map | <code>Object.&lt;string, Array.&lt;string&gt;&gt;</code> | Target-source field name map. |
* * *
<a name="buildGetCapabilitiesUrl"></a>
## buildGetCapabilitiesUrl(url) ⇒ <code>string</code>
Build WFS GetCapabilities URL.
**Kind**: global function
| Param | Type | Description |
| --- | --- | --- |
| url | <code>string</code> | WFS server URL |
* * *
<a name="parseCapabilities"></a>
## parseCapabilities(xml) ⇒ <code>object</code>
Parse WFS GetCapabilities response.
**Kind**: global function
**Returns**: <code>object</code> - Parsed capabilities (version, outputFormats, typeNames,
resultTypes, resultPaging).
| Param | Type | Description |
| --- | --- | --- |
| xml | <code>string</code> | XML string |
* * *
<a name="chooseOutputFormat"></a>
## chooseOutputFormat(formats) ⇒ <code>string</code> \| <code>null</code>
Choose the output format.
**Kind**: global function
| Param | Type | Description |
| --- | --- | --- |
| formats | <code>Array.&lt;string&gt;</code> | List of output formats |
* * *
<a name="buildGetFeatureUrl"></a>
## buildGetFeatureUrl(url, capabilities, paging) ⇒ <code>string</code>
Build WFS GetFeature URL.
**Kind**: global function
| Param | Type | Default | Description |
| --- | --- | --- | --- |
| url | <code>string</code> | | WFS server URL (ideally with typeName parameter) |
| capabilities | <code>object</code> | | Server capabilities |
| paging | <code>boolean</code> | <code>false</code> | Whether to set a start index and max feature count |
* * *
<a name="getBrowser"></a>
@ -1893,3 +1763,93 @@ Page is rendered in a headless browser (puppeteer) and saved as MHTML.
* * *
<a name="geocode"></a>
## geocode(address) ⇒ <code>Promise.&lt;object&gt;</code>
Geocode address.
**Kind**: global function
**Returns**: <code>Promise.&lt;object&gt;</code> - Geocode results.
| Param | Type | Description |
| --- | --- | --- |
| address | <code>string</code> | Address to geocode. |
* * *
<a name="geocodeCached"></a>
## geocodeCached(address) ⇒ <code>Promise.&lt;object&gt;</code>
Geocode address with caching.
**Kind**: global function
**Returns**: <code>Promise.&lt;object&gt;</code> - Geocode results.
| Param | Type | Description |
| --- | --- | --- |
| address | <code>string</code> | Address to geocode. |
* * *
<a name="buildGetCapabilitiesUrl"></a>
## buildGetCapabilitiesUrl(url) ⇒ <code>string</code>
Build WFS GetCapabilities URL.
**Kind**: global function
| Param | Type | Description |
| --- | --- | --- |
| url | <code>string</code> | WFS server URL |
* * *
<a name="parseCapabilities"></a>
## parseCapabilities(xml) ⇒ <code>object</code>
Parse WFS GetCapabilities response.
**Kind**: global function
**Returns**: <code>object</code> - Parsed capabilities (version, outputFormats, typeNames,
resultTypes, resultPaging).
| Param | Type | Description |
| --- | --- | --- |
| xml | <code>string</code> | XML string |
* * *
<a name="chooseOutputFormat"></a>
## chooseOutputFormat(formats) ⇒ <code>string</code> \| <code>null</code>
Choose the output format.
**Kind**: global function
| Param | Type | Description |
| --- | --- | --- |
| formats | <code>Array.&lt;string&gt;</code> | List of output formats |
* * *
<a name="buildGetFeatureUrl"></a>
## buildGetFeatureUrl(url, capabilities, paging) ⇒ <code>string</code>
Build WFS GetFeature URL.
**Kind**: global function
| Param | Type | Default | Description |
| --- | --- | --- | --- |
| url | <code>string</code> | | WFS server URL (ideally with typeName parameter) |
| capabilities | <code>object</code> | | Server capabilities |
| paging | <code>boolean</code> | <code>false</code> | Whether to set a start index and max feature count |
* * *

View File

@ -1,82 +0,0 @@
# Archiver (draft)
[`lib/archive.js`](/lib/archive.js) contains a set of functions that together implement a basic versioned data archive. The sections below provide simple usage examples.
## Archive a web page
Uses [Puppeteer](https://pptr.dev) to render web pages with a headless browser (Chromium).
```js
const puppeteer = require('puppeteer')
const archive = require('./lib/archive')
URL = 'https://data.sa.gov.au/data/dataset/street-trees'
DATE = new Date()
BASENAME = 'response'
// Open a new browser page
browser = await puppeteer.launch()
page = await browser.newPage()
// Navigate the page to a URL
response = await archive.loadPage(URL, page)
if (response.status() < 300) {
// Save and log HTML
html = await archive.readPageHtml(page)
archive.logData({
data: html,
filename: `${BASENAME}.html`,
url: URL,
date: DATE,
type: 'page',
format: 'html',
status: response.status(),
headers: response.headers(),
})
// Save and log MHTML
mhtml = await archive.readPageMhtml(page)
archive.logData({
data: mhtml,
filename: `${BASENAME}.mhtml`,
url: URL,
date: DATE,
type: 'page',
format: 'mhtml',
status: response.status(),
headers: response.headers(),
})
} else {
// Log error
archive.log({
url: URL,
date: DATE,
type: 'page',
status: response.status(),
headers: response.headers(),
})
}
// Close browser
await browser.close()
```
## Archive a file
See the functions in [`lib/workflow.js`](/lib/workflow.js).
```js
const workflow = require('./lib/workflow')
// Download a remote file
await workflow.downloadFile({
url: 'https://path/to/remote/file',
})
// Register an existing local file
await workflow.registerFile({
file: '/path/to/local/file',
url: 'https://original/path/to/remote/file',
type: 'data'
})
```

101
README.md
View File

@ -5,8 +5,9 @@ Authors: Ethan Welty ([fallingfruit.org](https://fallingfruit.org)), Steve Benne
Scripts that fetch and process data about inventoried trees and other plants from government and university open data sources. The result is used, among other things, to populate [opentrees.org](https://opentrees.org).
- [Installation](#installation)
- [Overview](#overview)
- [Usage](#usage)
- [Target Schema](#target-schema)
- [Target schema](#target-schema)
- [Development](#development)
## Installation
@ -17,24 +18,49 @@ cd opentrees-harvester
yarn
```
Copy `.env.example` to `.env` and set the environment variables as needed.
```bash
cp .env.example .env
```
- `DATA_ARCHIVE` (default `archive/data`): Directory of the data archive. See [Caching](#caching).
- `DATA_ARCHIVE_LOG` (default `archive/data.jsonl`): Log file of the data archive. See [Caching](#caching).
- `GEOCODE_ARCHIVE` (default `archive/geocode`): Directory of the geocode archive. Address geocode results are stored as JSON in files with paths of the form `{address_hash}.json`.
- `GOOGLE_MAPS_API_KEY`: Google Maps API key for geocoding addresses.
## Overview
### Sources ([`sources.js`](sources.js))
Each source dataset is described as a JavaScript object (see [`SourceProperties`](API.md#module_types..SourceProperties)) in a single giant array sorted nominally by `country`, `state`, `city`, `designation`, and `scope`. A schema `crosswalk` strives to map the source dataset to our [target schema](#target-schema).
### Downloading
The harvester downloads source `data`, `metadata`, and `license` from URLs using the specified [`DownloadMethod`](API.md#module_types..DownloadMethod), which includes file-based download, querying the ArcGIS Feature Layer API, or rendering the URL in a web browser.
### Caching
The harvester aggressively caches source data and metadata in order to avoid re-downloading files, track changes of files over time, and protect against the inevitable link rot. Files are stored in the archive (`DATA_ARCHIVE` environment variable) with paths of the form `{hash}/{timestamp}/{filename}`, where `hash` is either the checksum of the source URL (if downloaded from a URL) or the checksum of the file content (if not). Archived files are described in the log file (`DATA_ARCHIVE_LOG` environment variable), a [JSON Lines](http://jsonlines.org) file that records file path, content checksum, date, and other file properties (see [`ArchiveEntry`](API.md#module_types..ArchiveEntry)).
### Processing
Downloading, caching, and finally data processing are typically executed via the [`Source`](API.md#module_source..Source) class, which wraps source properties and provides methods for each step in the pipeline. Processing includes reading the source data with [GDAL](https://gdal.org), applying the schema crosswalk, and writing the result to a new file.
## Usage
### Source properties ([`sources/*.js`](sources))
Each source dataset is described as a Javascript `object` following the format described at [`API.md#SourceProperties`](API.md#SourceProperties). They are sorted into modules organized by country. The schema crosswalks (`crosswalk` properties) strive to map each source dataset to our [target schema](#target-schema).
### Command line interface ([`cli/*.js`](cli))
The command line interface provides a quick way to process all or a subset of the source datasets. See each command's help message:
The command line interface provides a quick way to download and process all or a subset of the source datasets. See each command's help message:
```bash
yarn get -h
yarn process -h
yarn get --help
yarn process --help
```
### Source class ([`lib/source.js`](lib/source.js))
The `Source` class wraps source properties to facilitate data processing. All methods are documented at [`API.md`](API.md#module_source..Source).
The [`Source`](API.md#module_source..Source) class wraps source properties to facilitate data processing.
Here is a simple example using the included [`tests/simple.csv`](tests/simple.csv):
@ -44,7 +70,7 @@ const Source = require('./lib/source')
const source = new Source(
props = {
id: 'test',
download: 'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
data: 'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
geometry: { x: 'LON', y: 'LAT' },
srs: 'EPSG:4326',
crosswalk: {
@ -52,26 +78,35 @@ const source = new Source(
common: x => x['NAME'].toLowerCase(),
height_cm: 'HEIGHT_CM'
}
},
dir = 'test/input'
}
)
```
Use [`Source.get()`](API.md/#module_source..Source+get) to download remote files (`source.props.download`) to the source directory (`source.dir`) and prepare them for processing.
Use [`Source.fetchFiles()`](API.md#module_source..Source+fetchFiles) to download the remote data file (`source.props.data`) to the archive.
```js
source.get()
// Promise { <pending> }
// [test] Downloading simple.csv
// [test] Downloaded simple.csv (0 MB)
// [test] Ready to process
await source.fetchFiles('data') // 'data' (default), 'metadata', or 'license'
// [
// {
// date: 2024-09-24T20:41:22.507Z,
// url: 'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
// method: 'file',
// checksum: '7303b0bda0ca68c7db73922af340e4aa',
// path: 'archive/data/d60579b4f36793bb54f6f4790bd683a2/2024-09-24T204122.507Z/simple.csv.txt',
// props: { type: 'data' }
// }
// ]
```
Optionally, use [`Source.find()`](API.md/#module_source..Source+find) to check that we downloaded a file recognized by GDAL, then [`Source.getRows()`](API.md/#module_source..Source+getRows) (or `Source.getFields()`, `Source.sample()`, `Source.glimpse()`, etc) to read content from the file with GDAL.
Optionally, use [`Source.findFiles()`](API.md#module_source..Source+findFiles) to retrieve them from the archive without downloading them. The output would be the same as above.
We can now open the dataset with GDAL, then use one of the many methods to inspect it ([`Source.getRows()`](API.md#module_source..Source+getRows), [`Source.glimpse()`](API.md#module_source..Source+glimpse), etc).
```js
source.find()
// 'test/input/simple.csv'
// Note: We need to set the GDAL driver explicitly because the data was downloaded as
// '.csv.txt' instead of '.csv'.
source.props.driver = 'CSV'
await source.open()
source.getRows(1)
// [
// {
@ -84,13 +119,12 @@ source.getRows(1)
// ]
```
Use [`Source.process()`](API.md/#module_source..Source+process) to process the input and write the result to a new file. In this case, this includes (1) writing a [VRT file](https://gdal.org/drivers/vector/vrt.html) to tell [GDAL](https://gdal.org) which spatial reference system and geometry field names to use when reading the input and (2) applying our schema crosswalk (`source.props.crosswalk`).
Use [`Source.process()`](API.md#module_source..Source+process) to process the input and write the result to a new file. In this case, this includes applying our schema crosswalk (`source.props.crosswalk`).
```js
source.process('test/output/output.csv')
// [test] Processing test/input/simple.csv
// [test] Writing and reading VRT file
// [test] Wrote output: test/output/output.csv
await source.process('output/test.csv', { overwrite: true })
// [test] Processing CSV:archive/data/d60579b4f36793bb54f6f4790bd683a2/2024-09-24T204122.507Z/simple.csv.txt
// [test] Wrote output: output/test.csv
```
We can modify the crosswalk following our conventions to apply unit conversions and other cleaning steps (see [`lib/convert.js`](lib/convert.js)). In this case, `height_cm` (in centimeters) is automatically converted to standard `height` (in meters).
@ -98,17 +132,7 @@ We can modify the crosswalk following our conventions to apply unit conversions
```js
const { modifyCrosswalk } = require('./lib/convert.js')
source.props.crosswalk = modifyCrosswalk(source.props.crosswalk)
source.process('test/output/output.csv', {overwrite: true})
```
Finally, the result can also be inspected using the `Source` class.
```js
const out = new Source({id: 'out'}, 'test/output')
out.find()
// 'test/output/output.csv'
out.getRows(1)
// [ { ref: '1', common: 'loquat', height: '12' } ]
await source.process('output/test.csv', { overwrite: true })
```
### Scientific name matching
@ -266,8 +290,9 @@ Numeric and date ranges use the field name suffixes `_min` and `_max`. For examp
## Development
The source code is documented using inline [JSDoc 3](https://jsdoc.app/) comments. Update the API documentation ([API.md](API.md)) from the source code by running:
The source code is documented using inline [JSDoc 3](https://jsdoc.app) comments. Update the API documentation ([API.md](API.md)) from the source code by running:
```bash
yarn test
yarn docs
```

View File

@ -1,20 +1,34 @@
const DEFAULT_OPTIONS = [
{
name: 'help', alias: 'h', type: Boolean, defaultValue: false
name: 'help', type: Boolean, defaultValue: false
},
{
name: 'ids', alias: 'i', type: String, multiple: true, defaultOption: true,
description: 'Restrict to these source identifiers.'
name: 'id', type: String, multiple: true,
description: 'Restrict by id.\nNote: These are currently assigned automatically on load based on source properties and are thus subject to change.'
},
{
name: 'countries', alias: 'c', type: String, multiple: true,
description: 'Restrict to these source countries (case and whitespace insensitive).'
name: 'country', type: String, multiple: true,
description: 'Restrict by country.'
},
{
name: 'dir', alias: 'd', type: String, defaultValue: 'data/${id}/input',
// Escape special characters for chalk. See https://github.com/Polymer/tools/pull/612
description: "Template for input directory, with source properties referred to by name (default: 'data/${id}/input').".
replace(/[{}\\]/g, '\\$&')
name: 'city', type: String, multiple: true,
description: 'Restrict by city.'
},
{
name: 'state', type: String, multiple: true,
description: 'Restrict by state.'
},
{
name: 'designation', type: String, multiple: true,
description: 'Restrict by designation.'
},
{
name: 'scope', type: String, multiple: true,
description: 'Restrict by scope.'
},
{
name: 'omit', type: Boolean, defaultValue: false,
description: 'Whether to keep sources flagged as `omit: true`.'
}
]

View File

@ -2,20 +2,31 @@
const commandLineUsage = require('command-line-usage')
const commandLineArgs = require('command-line-args')
const { DEFAULT_OPTIONS } = require('./common')
const { loadSources } = require('../lib/load')
const { loadSources } = require('../lib/sourceio')
const OPTIONS = [
...DEFAULT_OPTIONS,
{
name: 'force', alias: 'f', type: Boolean, defaultValue: false,
description: 'Overwrite input directory even if it is not empty.'
name: 'max-days', type: Number, defaultValue: null,
description: 'Maximum age (in days) of archived file (if older, re-download).'
},
{
name: 'type', type: String, defaultValue: 'data',
description: 'Type of file to download (data, metadata, license).'
},
{
name: 'format', type: String, defaultValue: 'pdf',
description: 'Format of browser download (pdf, png, mhtml, html).'
}
]
const USAGE = [
{
header: 'example/get.js',
content: 'Download remote files, unpack compressed or archive files, and execute shell commands to prepare source files for processing.'
header: 'get.js',
content: (
'Download files if missing or older than a maximum age in the archive.\n' +
'Source filters are case and whitespace insensitive.'
)
},
{
header: 'Options',
@ -26,7 +37,7 @@ const USAGE = [
// Parse command line arguments
let options
try {
options = commandLineArgs(OPTIONS)
options = commandLineArgs(OPTIONS, { camelCase: true })
if (options.help) {
console.log(commandLineUsage(USAGE))
process.exit(0)
@ -38,48 +49,53 @@ try {
}
// Load sources
const sources = loadSources(
`${__dirname}/../sources`,
{ ids: options.ids, countries: options.countries },
options.dir
)
const filters = {}
const filterKeys = ['id', 'country', 'city', 'state', 'designation', 'scope', 'omit']
filterKeys.forEach(key => filters[key] = options[key])
const sources = loadSources(`${__dirname}/../sources.js`, filters)
// Get sources
const success = []
const failure = []
const skip = []
async function getSource(source) {
const searchOptions = { format: options.format, maxDays: options.maxDays }
// Check if all files already exist
try {
const paths = await source.get(options.force)
if (paths.length) {
success.push(source.props.id)
} else {
skip.push(source.props.id)
}
await source.findFiles(options.type, searchOptions)
skip.push(source.props.id)
return
} catch (error) {
console.error(error.message)
failure.push(source.props.id)
// Download files
try {
await source.fetchFiles(options.type, searchOptions)
success.push(source.props.id)
} catch (error) {
console.error(error.message)
failure.push(source.props.id)
}
}
}
async function get() {
async function getSources() {
console.log(`Found ${sources.length} sources`)
await Promise.all(sources.map(source => getSource(source)))
if (success.length) {
console.log(
`${'[SUCCESS]'.green} Got ${success.length} sources:`,
success.join(', ')
`${'[SUCCESS]'.green} Downloaded ${success.length} sources:\n\n` +
success.join('\n')
)
}
if (failure.length) {
console.error(
`${'[ERROR]'.red} Failed to get ${failure.length} sources:`,
failure.join(', ')
`${'[ERROR]'.red} Failed to download ${failure.length} sources:\n\n` +
`${failure.join('\n')}`
)
}
if (skip.length) {
console.log(
`${'[SKIPPED]'.dim} Skipped ${skip.length} sources:`,
skip.join(', ')
`${'[SKIPPED]'.dim} ${skip.length} sources already in the archive (or empty):\n\n` +
skip.join('\n')
)
}
}
get()
getSources()

View File

@ -3,48 +3,62 @@ const colors = require('colors')
const commandLineUsage = require('command-line-usage')
const commandLineArgs = require('command-line-args')
const { DEFAULT_OPTIONS } = require('./common')
const { loadSources } = require('../lib/load')
const { deleteFeature } = require('../lib/clean')
const { loadSources } = require('../lib/sourceio')
const { interpolateString } = require('../lib/helpers')
const OPTIONS = [
...DEFAULT_OPTIONS,
{
name: 'out', alias: 'o', type: String, defaultValue: 'data/${id}/output/output.csv',
name: 'file', type: String, defaultValue: 'output/${id}.csv',
// Escape special characters for chalk. See https://github.com/Polymer/tools/pull/612
description: "Template for output file, with source properties referred to by name (default: 'data/${id}/output/output.csv').".
description: "Template for output file path, with source properties referred to by name (default: 'output/${id}.csv').".
replace(/[{}\\]/g, '\\$&')
},
{
name: 'driver', type: String,
description: (
'Name of GDAL driver to use for output (see https://gdal.org/drivers/vector). ' +
'Guessed from file extension if not provided.'
)
},
{
name: 'creation', type: String, multiple: true,
defaultValue: ['GEOMETRY=AS_WKT', 'STRING_QUOTING=IF_NEEDED'],
description: 'Driver-specific dataset creation options (see https://gdal.org/drivers/vector).'
},
{
name: 'overwrite', type: Boolean, defaultValue: false,
description: 'Overwrite output file even if it already exists.'
},
{
name: 'centroids', type: Boolean, defaultValue: false,
description: 'Whether to reduce non-point geometries to centroids.'
},
{
name: 'keepInvalid', type: Boolean, defaultValue: false,
name: 'keep-invalid', type: Boolean, defaultValue: false,
description: 'Whether to keep features with empty or invalid geometries.'
},
{
name: 'keepFields', type: Boolean, defaultValue: false,
name: 'keep-fields', type: Boolean, defaultValue: false,
description: 'Whether to keep the input feature fields alongside the result of the schema crosswalk.'
},
{
name: 'prefix', type: String, defaultValue: '',
description: 'String to append to input field names to prevent collisions with output field names. Applies only with `keepFields`.'
description: 'String to append to input field names to prevent collisions with output field names. Applies only with `keep-fields`.'
},
{
name: 'bounds', type: Number, multiple: true,
description: 'Bounding box in the format [xmin, ymin, xmax, ymax] (EPSG:4326). If provided, features outside the bounds are skipped.'
},
{
name: 'force', alias: 'f', type: Boolean, defaultValue: false,
description: 'Overwrite output file even if it already exists.'
}
]
const USAGE = [
{
header: 'example/process.js',
content: 'Process sources and save as new vector files.',
header: 'process.js',
content: (
'Process sources and save as new vector files.\n' +
'Source filters are case and whitespace insensitive.'
)
},
{
header: 'Options',
@ -55,7 +69,7 @@ const USAGE = [
// Parse command line arguments
let options
try {
options = commandLineArgs(OPTIONS)
options = commandLineArgs(OPTIONS, { camelCase: true })
if (options.help) {
console.log(commandLineUsage(USAGE))
process.exit(0)
@ -67,52 +81,60 @@ try {
}
// Load sources
const sources = loadSources(
`${__dirname}/../sources`,
{ ids: options.ids, countries: options.countries },
options.dir
)
const filters = {}
const filterKeys = ['id', 'country', 'city', 'state', 'designation', 'scope', 'omit']
filterKeys.forEach(key => filters[key] = options[key])
const sources = loadSources(`${__dirname}/../sources.js`, filters)
// Process sources
const success = []
const failure = []
const skip = []
const processOptions = {
overwrite: options.force,
driver: options.driver,
creation: options.creation,
overwrite: options.overwrite,
centroids: options.centroids,
keepInvalid: options.keepInvalid,
keepFields: options.keepFields,
prefix: options.prefix,
bounds: options.bounds,
deleteFunc: deleteFeature
}
sources.forEach(source => {
const file = interpolateString(options.out, source.props)
async function processSource(source) {
const file = interpolateString(options.file, source.props)
try {
const result = source.process(file, processOptions)
if (result) success.push(source.props.id)
else skip.push(source.props.id)
const result = await source.process(file, processOptions)
if (result) {
success.push(source.props.id)
} else {
skip.push(source.props.id)
}
} catch (error) {
console.error(error.message)
failure.push(source.props.id)
}
})
if (success.length) {
console.log(
`${'[SUCCESS]'.green} Processed ${success.length} sources:`,
success.join(', ')
)
}
if (failure.length) {
console.error(
`${'[ERROR]'.red} Failed to process ${failure.length} sources:`,
failure.join(', ')
)
}
if (skip.length) {
console.log(
`${'[SKIPPED]'.dim} Skipped ${skip.length} sources:`,
skip.join(', ')
)
/**
 * Process every loaded source, one at a time, then print a summary.
 *
 * Sources are awaited sequentially (not in parallel). Once all have been
 * attempted, the ids collected in `success`, `failure` and `skip` are printed,
 * one id per line, under a coloured heading. Empty lists print nothing.
 *
 * @returns {Promise<void>}
 */
async function processSources() {
  for (const source of sources) {
    await processSource(source)
  }
  // Ids are listed one per line below each heading
  const listIds = ids => ids.join('\n')
  if (success.length > 0) {
    const heading = `${'[SUCCESS]'.green} Processed ${success.length} sources:\n\n`
    console.log(heading + listIds(success))
  }
  if (failure.length > 0) {
    const heading = `${'[ERROR]'.red} Failed to process ${failure.length} sources:\n\n`
    console.error(heading + listIds(failure))
  }
  if (skip.length > 0) {
    const heading = `${'[SKIPPED]'.dim} ${skip.length} sources already existed:\n\n`
    console.log(heading + listIds(skip))
  }
}
processSources()

View File

@ -1,4 +1,5 @@
require('dotenv').config()
const path = require('path')
const fs = require('fs')
const mime = require('mime-types')
const puppeteer = require('puppeteer')

View File

@ -1,88 +0,0 @@
/**
* Load the provided source datasets.
*
* @module
*/
const { resolve } = require('path')
const glob = require('glob')
const colors = require('colors')
const Source = require('./source')
const { interpolateString, reduceString } = require('./helpers')
const { modifyCrosswalk } = require('./convert')
/**
* Load sources from source properties.
*
* @param {string} path - Directory of JS files containing source properties.
* @param {object} [filters={}]
* @param {string[]} filters.ids - Return only sources with these identifiers.
* @param {string[]} filters.countries - Return only source with these countries.
* @param {string} [dir=data/${id}/input] - Source input directory (template
* interpolated on source properties).
* @returns {Source[]}
*/
function loadSources(path, filters = {}, dir = 'data/${id}/input') {
// Load source properties
const globOptions = { absolute: true, nodir: true, cwd: resolve(path) }
var sourceProps = glob.sync('**/*.js', globOptions).
map(file => require(file)).
flat()
// Ensure that source identifiers are unique
const all = sourceProps.map(props => props.id)
const duplicated = all.filter((item, index) => all.indexOf(item) != index)
if (duplicated.length) {
throw new Error(
`Duplicate source identifiers: ${[...new Set(duplicated)].join(', ')}`)
}
// Filter source properties
sourceProps = sourceProps.filter(props => {
return (
(
!filters.ids ||
filters.ids.map(x => reduceString(x)).includes(reduceString(props.id))
) &&
(
!filters.countries ||
filters.countries.map(x => reduceString(x)).includes(reduceString(props.country))
)
)
})
const sources = []
const invalid = []
for (const props of sourceProps) {
let input
let source
try {
// Modify crosswalk for unit conversions and range parsing
props.crosswalk = modifyCrosswalk(props.crosswalk)
input = interpolateString(dir, props)
} catch (error) {
const tag = `[${props.id}]`.red
console.error(`${tag} ${error.message}`)
invalid.push(props.id)
continue
}
try {
// Convert to Source class
source = new Source(props, input)
} catch (error) {
console.error(error.message)
invalid.push(props.id)
continue
}
sources.push(source)
}
// Report failures
if (invalid.length) {
console.error(
`${'[ERROR]'.red} Skipped ${invalid.length} invalid source(s):`,
invalid.join(', ')
)
}
return sources
}
module.exports = {
loadSources
}

View File

@ -1,3 +1,10 @@
/**
* Match source column names to target schema based on existing crosswalks.
*
* @module
* @private
*/
const { parseFieldName } = require('./convert.js')
/**
@ -5,7 +12,7 @@ const { parseFieldName } = require('./convert.js')
*
* Assumes that the function accesses properties of an object named `x`.
*
* @param {*} func - Function whose source to extract field names from.
* @param {function} func - Function whose source to extract field names from.
* @returns {string[]} Field names accessed by the function.
* @private
* @example
@ -107,7 +114,7 @@ function matchFieldName(name, map) {
*
* @param {string[]} names - Source field names.
* @param {Object.<string, string[]>} map - Target-source field name map.
* @returns {Array[Object.<string, Object[]>, string[]]} Target field names mapped to
* @returns {Array} Target field names mapped to
* source field `name`, result `count`, and number of target field name `matches`
* (if greater than 1), followed by unmatched source field names.
* @example
@ -156,7 +163,7 @@ function guessCrosswalk(names, map) {
/**
* Print crosswalk guess to console.
*
* @param {Array[Object.<string, Object[]>, string[]]} guess - Crosswalk guess.
* @param {Array} guess - Crosswalk guess.
* @example
* guess = guessCrosswalk(['foz', 'foo', 'zzz'], {bar: ['foo', 'foo', 'foz'], baz: ['foo']})
* printCrosswalkGuess(guess)

View File

@ -12,7 +12,7 @@ const gdal = require('gdal-async')
const { table } = require('table')
const helpers = require('./helpers')
const archive = require('./archive')
const workflow = require('./workflow')
const workflow = require('./archiveWorkflow')
const LICENSES = require('./licenses')
const {ArchiveFile, ArchiveEntry, SourceProperties, BrowserFormat, FileType, SourceFile} = require('./types')
@ -584,55 +584,57 @@ class Source {
}
outputFeature.fields.set(outputFields)
// Geometry
let inputGeometry = this.getFeatureGeometry(
inputFeature, { fields: inputFields, srs, isXY }
)
let isValid = false
if (inputGeometry) {
// Reduce to centroid if needed
if (options.centroids && inputGeometry.wkbType != gdal.wkbPoint) {
inputGeometry = inputGeometry.centroid()
// HACK: Centroid sometimes loses SRS
inputGeometry.srs = srs
}
// Check if geometry is valid
isValid = inputGeometry.isValid()
const isPoint = inputGeometry.wkbType == gdal.wkbPoint
if (isPoint) {
isValid = Boolean(
isValid &&
inputGeometry.x &&
inputGeometry.y &&
isFinite(inputGeometry.x) &&
isFinite(inputGeometry.y)
)
}
// Transform geometry
if (isValid && transform) {
try {
inputGeometry.transform(transform)
} catch (error) {
isValid = false
if (outputLayer.geomType != gdal.wkbNone) {
let inputGeometry = this.getFeatureGeometry(
inputFeature, { fields: inputFields, srs, isXY }
)
let isValid = false
if (inputGeometry) {
// Reduce to centroid if needed
if (options.centroids && inputGeometry.wkbType != gdal.wkbPoint) {
inputGeometry = inputGeometry.centroid()
// HACK: Centroid sometimes loses SRS
inputGeometry.srs = srs
}
// Check if geometry is valid
isValid = inputGeometry.isValid()
const isPoint = inputGeometry.wkbType == gdal.wkbPoint
if (isPoint) {
isValid = Boolean(
isValid &&
inputGeometry.x &&
inputGeometry.y &&
isFinite(inputGeometry.x) &&
isFinite(inputGeometry.y)
)
}
// Transform geometry
if (isValid && transform) {
try {
inputGeometry.transform(transform)
} catch (error) {
isValid = false
}
}
if (isValid && options.bounds) {
if (!inputGeometry.within(options.bounds)) {
counts.outOfBoundGeometries++
continue
}
}
}
if (isValid && options.bounds) {
if (!inputGeometry.within(options.bounds)) {
counts.outOfBoundGeometries++
if (!isValid) {
counts.invalidGeometries++
if (!options.keepInvalid) {
continue
}
inputGeometry = null
}
}
if (!isValid) {
counts.invalidGeometries++
if (!options.keepInvalid) {
continue
if (inputGeometry) {
// Flatten 3D geometries to 2D
inputGeometry.flattenTo2D()
outputFeature.setGeometry(inputGeometry)
}
inputGeometry = null
}
if (inputGeometry) {
// Flatten 3D geometries to 2D
inputGeometry.flattenTo2D()
outputFeature.setGeometry(inputGeometry)
}
outputLayer.features.add(outputFeature)
}

View File

@ -1,12 +1,15 @@
/**
* Read and write source properties.
* Read and write sources and source properties.
*
* @module
*/
const path = require('path')
const util = require('util')
const fs = require('fs')
const colors = require('colors')
const Source = require('./source')
const { reduceString } = require('./helpers')
const { modifyCrosswalk } = require('./convert')
const {SourceProperties} = require('./types')
/**
@ -24,18 +27,18 @@ function readSourceProperties(file) {
/**
* Write source properties to a file.
*
* @param {SourceProperties[]} sources - Source properties.
* @param {SourceProperties[]} sourceProps - Source properties.
* @param {string} file - Path to new source properties file.
* @param {string} currentFile - Path to current source properties file (
* defaults to `file`). Used to replicate the header
* (everything before `module.exports`).
*/
function writeSourceProperties(sources, file, currentFile) {
function writeSourceProperties(sourceProps, file, currentFile) {
if (!currentFile) {
currentFile = file
}
// const copies = structuredClone(sources)
const copies = sources
// const copies = structuredClone(sourceProps)
const copies = sourceProps
// Define custom inspection for functions
function inspectFunction() {
const txt = this.toString()
@ -77,7 +80,139 @@ function writeSourceProperties(sources, file, currentFile) {
fs.writeFileSync(file, `${headerMatch[1]}module.exports = ${txt}\n`)
}
/**
 * Build unique source ids from source properties.
 *
 * Each id starts as the source's `country`, `state`, `city` and `designation`
 * (missing values are skipped) joined by ' > '. Ids shared by several sources
 * are then disambiguated in two passes:
 *
 * 1. If the sources sharing an id do not all have the same `scope`, the scope
 *    is appended to the id of every one of them that has a scope.
 * 2. Ids that are still duplicated get a 1-based counter appended, in input
 *    order.
 *
 * @param {SourceProperties[]} sourceProps - Source properties.
 * @returns {string[]} Source ids, in the same order as `sourceProps`.
 * @private
 */
function buildSourceIds(sourceProps) {
  const ids = sourceProps.map(props =>
    [props.country, props.state, props.city, props.designation]
      .filter(Boolean)
      .join(' > ')
  )
  // Distinct ids that currently occur more than once, in order of first
  // occurrence (a Map iterates in insertion order)
  const findDuplicates = () => {
    const counts = new Map()
    for (const id of ids) {
      counts.set(id, (counts.get(id) || 0) + 1)
    }
    return [...counts].filter(([, count]) => count > 1).map(([id]) => id)
  }
  // Positions currently holding the given id
  const indicesOf = target => {
    const indices = []
    ids.forEach((id, index) => {
      if (id === target) {
        indices.push(index)
      }
    })
    return indices
  }
  // Append scope to duplicate ids if scope not same for all duplicates
  for (const duplicate of findDuplicates()) {
    const indices = indicesOf(duplicate)
    const scopes = indices.map(index => sourceProps[index].scope)
    if (scopes.some(scope => scope !== scopes[0])) {
      for (const index of indices) {
        const scope = sourceProps[index].scope
        // A missing scope is skipped (as missing id parts are above) rather
        // than rendered as the text "undefined"
        if (scope) {
          ids[index] = `${duplicate} > ${scope}`
        }
      }
    }
  }
  // Append integer to ids that are still duplicated
  for (const duplicate of findDuplicates()) {
    indicesOf(duplicate).forEach((index, position) => {
      ids[index] = `${duplicate} > ${position + 1}`
    })
  }
  return ids
}
/**
 * Load sources from source properties.
 *
 * Each entry read from `file` is assigned the id generated by
 * `buildSourceIds` before filtering. A source passes a filter when its
 * property of the same name, run through `reduceString`, equals one of the
 * filter values run through `reduceString`; empty or missing filters are
 * ignored, and a source lacking the filtered property never matches.
 *
 * Crosswalks are modified for unit conversions and range parsing. Sources
 * whose crosswalk cannot be modified, or that the `Source` constructor
 * rejects, are logged to stderr and left out of the result.
 *
 * @param {string} file - Path to file containing source properties.
 * @param {object} [filters={}]
 * @param {string[]} filters.id - Filter by id.
 * @param {string[]} filters.country - Filter by country.
 * @param {string[]} filters.state - Filter by state.
 * @param {string[]} filters.city - Filter by city.
 * @param {string[]} filters.designation - Filter by designation.
 * @param {string[]} filters.scope - Filter by scope.
 * @param {boolean} filters.omit - Whether to include sources flagged as `omit: true`.
 * @returns {Source[]}
 */
function loadSources(file, filters = {}) {
  // Read the properties and stamp each entry with its generated id
  const allProps = readSourceProperties(file)
  const ids = buildSourceIds(allProps)
  allProps.forEach((props, index) => {
    props.id = ids[index]
  })
  // Sources flagged `omit` are dropped unless explicitly requested
  const candidates = filters.omit === true
    ? allProps
    : allProps.filter(props => !props.omit)
  // Whether a source satisfies every active filter
  const passesFilters = props => {
    for (const key in filters) {
      if (key === 'omit') {
        continue
      }
      const allowed = filters[key]
      if (!allowed || !allowed.length) {
        continue
      }
      const value = props[key] ? reduceString(props[key]) : null
      if (!allowed.map(reduceString).includes(value)) {
        return false
      }
    }
    return true
  }
  const selected = candidates.filter(props => passesFilters(props))
  // Build a Source from properties; returns null (after logging) on failure
  const build = props => {
    if (props.crosswalk) {
      try {
        // Modify crosswalk for unit conversions and range parsing
        props.crosswalk = modifyCrosswalk(props.crosswalk)
      } catch (error) {
        const label = `[${props.id}]`.red
        console.error(`${label} ${error.message}`)
        return null
      }
    }
    try {
      return new Source(props)
    } catch (error) {
      console.error(error.message)
      return null
    }
  }
  const sources = []
  const invalid = []
  for (const props of selected) {
    const source = build(props)
    if (source) {
      sources.push(source)
    } else {
      invalid.push(props.id)
    }
  }
  // Report failures
  if (invalid.length) {
    const heading = `${'[ERROR]'.red} Skipped ${invalid.length} invalid source(s):\n\n`
    console.error(heading + invalid.join('\n'))
  }
  return sources
}
module.exports = {
readSourceProperties,
writeSourceProperties
writeSourceProperties,
loadSources,
buildSourceIds
}

View File

@ -1,3 +1,4 @@
const fs = require('fs')
const gdal = require('gdal-async')
const helpers = require('./lib/helpers')