mirror of https://github.com/ezwelty/opentrees-harvester.git (synced 2025-06-18 10:45:42 -04:00)
Update readme and cli
This commit is contained in:
parent dc728e28a0
commit 0234726c85
.gitignore (vendored): 1 change

@@ -4,3 +4,4 @@
 /node_modules/
 archive
 .env
+output
API.md: 318 changes
@@ -1,9 +1,6 @@
 ## Modules
 
 <dl>
-<dt><a href="#module_load">load</a></dt>
-<dd><p>Load the provided source datasets.</p>
-</dd>
 <dt><a href="#module_names">names</a></dt>
 <dd><p>Parse scientific names.</p>
 </dd>
@@ -11,7 +8,7 @@
 <dd><p>Describe a source dataset.</p>
 </dd>
 <dt><a href="#module_sourceio">sourceio</a></dt>
-<dd><p>Read and write source properties.</p>
+<dd><p>Read and write sources and source properties.</p>
 </dd>
 <dt><a href="#module_taxamatch">taxamatch</a></dt>
 <dd><p>Match scientific names.</p>
@@ -80,30 +77,6 @@ binary files and same as UTF-8 for text.</p>
 <dt><a href="#search">search(params, options)</a> ⇒ <code>Array.<ArchiveEntry></code></dt>
 <dd><p>Search log for matching entries.</p>
 </dd>
-<dt><a href="#geocode">geocode(address)</a> ⇒ <code>Promise.<object></code></dt>
-<dd><p>Geocode address.</p>
-</dd>
-<dt><a href="#geocodeCached">geocodeCached(address)</a> ⇒ <code>Promise.<object></code></dt>
-<dd><p>Geocode address with caching.</p>
-</dd>
-<dt><a href="#buildMapFromCrosswalks">buildMapFromCrosswalks(crosswalks)</a> ⇒ <code>Array.<Object.<string, Array.<string>>></code></dt>
-<dd><p>Build source-target field name map from crosswalks.</p>
-</dd>
-<dt><a href="#matchFieldName">matchFieldName(name, map)</a> ⇒ <code>Array.<Object></code></dt>
-<dd><p>Find potential target field names matching a source field name.</p>
-</dd>
-<dt><a href="#buildGetCapabilitiesUrl">buildGetCapabilitiesUrl(url)</a> ⇒ <code>string</code></dt>
-<dd><p>Build WFS GetCapabilities URL.</p>
-</dd>
-<dt><a href="#parseCapabilities">parseCapabilities(xml)</a> ⇒ <code>object</code></dt>
-<dd><p>Parse WFS GetCapabilities response.</p>
-</dd>
-<dt><a href="#chooseOutputFormat">chooseOutputFormat(formats)</a> ⇒ <code>string</code> | <code>null</code></dt>
-<dd><p>Choose the output format.</p>
-</dd>
-<dt><a href="#buildGetFeatureUrl">buildGetFeatureUrl(url, capabilities, paging)</a> ⇒ <code>string</code></dt>
-<dd><p>Build WFS GetFeature URL.</p>
-</dd>
 <dt><a href="#getBrowser">getBrowser()</a> ⇒ <code>Promise.<puppeteer.Browser></code></dt>
 <dd><p>Get cached browser instance.</p>
 </dd>
@@ -123,34 +96,26 @@ binary files and same as UTF-8 for text.</p>
 <dd><p>Download web page as MHTML and log result.</p>
 <p>Page is rendered in a headless browser (puppeteer) and saved as MHTML.</p>
 </dd>
+<dt><a href="#geocode">geocode(address)</a> ⇒ <code>Promise.<object></code></dt>
+<dd><p>Geocode address.</p>
+</dd>
+<dt><a href="#geocodeCached">geocodeCached(address)</a> ⇒ <code>Promise.<object></code></dt>
+<dd><p>Geocode address with caching.</p>
+</dd>
+<dt><a href="#buildGetCapabilitiesUrl">buildGetCapabilitiesUrl(url)</a> ⇒ <code>string</code></dt>
+<dd><p>Build WFS GetCapabilities URL.</p>
+</dd>
+<dt><a href="#parseCapabilities">parseCapabilities(xml)</a> ⇒ <code>object</code></dt>
+<dd><p>Parse WFS GetCapabilities response.</p>
+</dd>
+<dt><a href="#chooseOutputFormat">chooseOutputFormat(formats)</a> ⇒ <code>string</code> \| <code>null</code></dt>
+<dd><p>Choose the output format.</p>
+</dd>
+<dt><a href="#buildGetFeatureUrl">buildGetFeatureUrl(url, capabilities, paging)</a> ⇒ <code>string</code></dt>
+<dd><p>Build WFS GetFeature URL.</p>
+</dd>
+</dl>
-
-<a name="module_load"></a>
-
-## load
-Load the provided source datasets.
-
-
-* * *
-
-<a name="module_load..loadSources"></a>
-
-### load~loadSources(path, [filters], [dir]) ⇒ <code>Array.<Source></code>
-Load sources from source properties.
-
-**Kind**: inner method of [<code>load</code>](#module_load)
-
-| Param | Type | Default | Description |
-| --- | --- | --- | --- |
-| path | <code>string</code> | | Directory of JS files containing source properties. |
-| [filters] | <code>object</code> | <code>{}</code> | |
-| filters.ids | <code>Array.<string></code> | | Return only sources with these identifiers. |
-| filters.countries | <code>Array.<string></code> | | Return only source with these countries. |
-| [dir] | <code>string</code> | <code>"data/${id}/input"</code> | Source input directory (template interpolated on source properties). |
-
-
-* * *
-
 <a name="module_names"></a>
 
 ## names
@@ -1057,12 +1022,13 @@ Throw or print error to console (red).
 <a name="module_sourceio"></a>
 
 ## sourceio
-Read and write source properties.
+Read and write sources and source properties.
 
 
 * [sourceio](#module_sourceio)
     * [~readSourceProperties(file)](#module_sourceio..readSourceProperties) ⇒ <code>Array.<SourceProperties></code>
-    * [~writeSourceProperties(sources, file, currentFile)](#module_sourceio..writeSourceProperties)
+    * [~writeSourceProperties(sourceProps, file, currentFile)](#module_sourceio..writeSourceProperties)
+    * [~loadSources(file, [filters])](#module_sourceio..loadSources) ⇒ <code>Array.<Source></code>
 
 
 * * *
@@ -1084,18 +1050,42 @@ Read source properties from a file.
 
 <a name="module_sourceio..writeSourceProperties"></a>
 
-### sourceio~writeSourceProperties(sources, file, currentFile)
+### sourceio~writeSourceProperties(sourceProps, file, currentFile)
 Write source properties to a file.
 
 **Kind**: inner method of [<code>sourceio</code>](#module_sourceio)
 
 | Param | Type | Description |
 | --- | --- | --- |
-| sources | <code>Array.<SourceProperties></code> | Source properties. |
+| sourceProps | <code>Array.<SourceProperties></code> | Source properties. |
 | file | <code>string</code> | Path to new source properties file. |
 | currentFile | <code>string</code> | Path to current source properties file ( defaults to `file`). Used to replicate the header (everything before `module.exports`). |
 
 
 * * *
 
+<a name="module_sourceio..loadSources"></a>
+
+### sourceio~loadSources(file, [filters]) ⇒ <code>Array.<Source></code>
+Load sources from source properties.
+
+Crosswalks are modified for unit conversions and range parsing.
+
+**Kind**: inner method of [<code>sourceio</code>](#module_sourceio)
+
+| Param | Type | Default | Description |
+| --- | --- | --- | --- |
+| file | <code>string</code> | | Path to file containing source properties. |
+| [filters] | <code>object</code> | <code>{}</code> | |
+| filters.id | <code>Array.<string></code> | | Filter by id. |
+| filters.country | <code>Array.<string></code> | | Filter by country. |
+| filters.state | <code>Array.<string></code> | | Filter by state. |
+| filters.city | <code>Array.<string></code> | | Filter by city. |
+| filters.designation | <code>Array.<string></code> | | Filter by designation. |
+| filters.scope | <code>Array.<string></code> | | Filter by scope. |
+| filters.omit | <code>boolean</code> | | Whether to include sources flagged as `omit: true`. |
+
+
+* * *
+
 <a name="module_taxamatch"></a>
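The new `sourceio~loadSources` documented above replaces the deleted `load` module. A minimal usage sketch of the documented signature, called from the repository root (the filter values are illustrative):

```js
const { loadSources } = require('./lib/sourceio')

// Load only tree inventories for one country; sources flagged `omit: true`
// are excluded by default (pass `omit: true` in filters to keep them).
const sources = loadSources('sources.js', {
  country: ['Australia'],
  scope: ['tree'],
})
console.log(sources.length)
```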
@@ -1373,6 +1363,7 @@ Additional properties not used by [Source](Source) but used elsewhere.
 | state | <code>string</code> | Local name of first-level administrative division (see https://en.wikipedia.org/wiki/List_of_administrative_divisions_by_country) with the exception of: - Ireland: NUTS 3 Region (https://en.wikipedia.org/wiki/NUTS_statistical_regions_of_Ireland) - Japan: Region (https://en.wikipedia.org/wiki/List_of_regions_of_Japan) - Netherlands: Province (https://en.wikipedia.org/wiki/Provinces_of_the_Netherlands) - New Zealand: Region (https://en.wikipedia.org/wiki/Regions_of_New_Zealand) - United Kingdom (England): Region (https://en.wikipedia.org/wiki/Regions_of_England) - United Kingdom (other): Country |
 | city | <code>string</code> | Local name of city or municipality. |
 | designation | <code>string</code> | Local name of `city` subset, administrative unit, university, or other institution if not `country`, `state`, or `city`. |
 | scope | <code>string</code> | Scope or type of the inventory (e.g. `tree`, `tree-street`, `tree-street-main`, `tree-park`, `tree-notable`). |
 | language | <code>string</code> | Language of contents as an [ISO 639-1](https://en.wikipedia.org/wiki/ISO_639-1) code (e.g. `en`) and an optional [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) region code (e.g. `en-AU`). |
 | primary | <code>string</code> | `id` of the primary source (for grouping sources together). |
 | long | <code>string</code> | Full name of the government body, university, or other institution (e.g. `City of Melbourne`). |
@@ -1668,127 +1659,6 @@ descending.
 | [options.maxDays] | <code>int</code> | Maximum age of result in days |
 
 
 * * *
 
-<a name="geocode"></a>
-
-## geocode(address) ⇒ <code>Promise.<object></code>
-Geocode address.
-
-**Kind**: global function
-**Returns**: <code>Promise.<object></code> - Geocode results.
-
-| Param | Type | Description |
-| --- | --- | --- |
-| address | <code>string</code> | Address to geocode. |
-
-
-* * *
-
-<a name="geocodeCached"></a>
-
-## geocodeCached(address) ⇒ <code>Promise.<object></code>
-Geocode address with caching.
-
-**Kind**: global function
-**Returns**: <code>Promise.<object></code> - Geocode results.
-
-| Param | Type | Description |
-| --- | --- | --- |
-| address | <code>string</code> | Address to geocode. |
-
-
-* * *
-
-<a name="buildMapFromCrosswalks"></a>
-
-## buildMapFromCrosswalks(crosswalks) ⇒ <code>Array.<Object.<string, Array.<string>>></code>
-Build source-target field name map from crosswalks.
-
-**Kind**: global function
-**Returns**: <code>Array.<Object.<string, Array.<string>>></code> - Lowercased source field names
-mapped to each target field name.
-
-| Param | Type | Description |
-| --- | --- | --- |
-| crosswalks | <code>Array.<Object.<string, (string\|function())>></code> | Source crosswalks. |
-
-
-* * *
-
-<a name="matchFieldName"></a>
-
-## matchFieldName(name, map) ⇒ <code>Array.<Object></code>
-Find potential target field names matching a source field name.
-
-**Kind**: global function
-
-| Param | Type | Description |
-| --- | --- | --- |
-| name | <code>string</code> | Source field name. |
-| map | <code>Object.<string, Array.<string>></code> | Target-source field name map. |
-
-
-* * *
-
-<a name="buildGetCapabilitiesUrl"></a>
-
-## buildGetCapabilitiesUrl(url) ⇒ <code>string</code>
-Build WFS GetCapabilities URL.
-
-**Kind**: global function
-
-| Param | Type | Description |
-| --- | --- | --- |
-| url | <code>string</code> | WFS server URL |
-
-
-* * *
-
-<a name="parseCapabilities"></a>
-
-## parseCapabilities(xml) ⇒ <code>object</code>
-Parse WFS GetCapabilities response.
-
-**Kind**: global function
-**Returns**: <code>object</code> - Parsed capabilities (version, outputFormats, typeNames,
-resultTypes, resultPaging).
-
-| Param | Type | Description |
-| --- | --- | --- |
-| xml | <code>string</code> | – XML string |
-
-
-* * *
-
-<a name="chooseOutputFormat"></a>
-
-## chooseOutputFormat(formats) ⇒ <code>string</code> \| <code>null</code>
-Choose the output format.
-
-**Kind**: global function
-
-| Param | Type | Description |
-| --- | --- | --- |
-| formats | <code>Array.<string></code> | List of output formats |
-
-
-* * *
-
-<a name="buildGetFeatureUrl"></a>
-
-## buildGetFeatureUrl(url, capabilities, paging) ⇒ <code>string</code>
-Build WFS GetFeature URL.
-
-**Kind**: global function
-
-| Param | Type | Default | Description |
-| --- | --- | --- | --- |
-| url | <code>string</code> | | WFS server URL (ideally with typeName parameter) |
-| capabilities | <code>object</code> | | Server capabilities |
-| paging | <code>boolean</code> | <code>false</code> | Whether to set a start index and max feature count |
-
-
-* * *
-
 <a name="getBrowser"></a>
@@ -1893,3 +1763,93 @@ Page is rendered in a headless browser (puppeteer) and saved as MHTML.
 
 
 * * *
 
+<a name="geocode"></a>
+
+## geocode(address) ⇒ <code>Promise.<object></code>
+Geocode address.
+
+**Kind**: global function
+**Returns**: <code>Promise.<object></code> - Geocode results.
+
+| Param | Type | Description |
+| --- | --- | --- |
+| address | <code>string</code> | Address to geocode. |
+
+
+* * *
+
+<a name="geocodeCached"></a>
+
+## geocodeCached(address) ⇒ <code>Promise.<object></code>
+Geocode address with caching.
+
+**Kind**: global function
+**Returns**: <code>Promise.<object></code> - Geocode results.
+
+| Param | Type | Description |
+| --- | --- | --- |
+| address | <code>string</code> | Address to geocode. |
+
+
+* * *
+
+<a name="buildGetCapabilitiesUrl"></a>
+
+## buildGetCapabilitiesUrl(url) ⇒ <code>string</code>
+Build WFS GetCapabilities URL.
+
+**Kind**: global function
+
+| Param | Type | Description |
+| --- | --- | --- |
+| url | <code>string</code> | WFS server URL |
+
+
+* * *
+
+<a name="parseCapabilities"></a>
+
+## parseCapabilities(xml) ⇒ <code>object</code>
+Parse WFS GetCapabilities response.
+
+**Kind**: global function
+**Returns**: <code>object</code> - Parsed capabilities (version, outputFormats, typeNames,
+resultTypes, resultPaging).
+
+| Param | Type | Description |
+| --- | --- | --- |
+| xml | <code>string</code> | – XML string |
+
+
+* * *
+
+<a name="chooseOutputFormat"></a>
+
+## chooseOutputFormat(formats) ⇒ <code>string</code> \| <code>null</code>
+Choose the output format.
+
+**Kind**: global function
+
+| Param | Type | Description |
+| --- | --- | --- |
+| formats | <code>Array.<string></code> | List of output formats |
+
+
+* * *
+
+<a name="buildGetFeatureUrl"></a>
+
+## buildGetFeatureUrl(url, capabilities, paging) ⇒ <code>string</code>
+Build WFS GetFeature URL.
+
+**Kind**: global function
+
+| Param | Type | Default | Description |
+| --- | --- | --- | --- |
+| url | <code>string</code> | | WFS server URL (ideally with typeName parameter) |
+| capabilities | <code>object</code> | | Server capabilities |
+| paging | <code>boolean</code> | <code>false</code> | Whether to set a start index and max feature count |
+
+
+* * *
ARCHIVE.md: 82 changes (file deleted)
@@ -1,82 +0,0 @@
-# Archiver (draft)
-
-[`lib/archive.js`](/lib/archive.js) contains a set of functions that together implement a basic versioned data archive. The sections below provide simple usage examples.
-
-## Archive a web page
-
-Uses [Puppeteer](https://pptr.dev) to render web pages with a headless browser (Chromium).
-
-```js
-const puppeteer = require('puppeteer')
-const archive = require('./lib/archive')
-
-URL = 'https://data.sa.gov.au/data/dataset/street-trees'
-DATE = new Date()
-BASENAME = 'response'
-
-// Open a new browser page
-browser = await puppeteer.launch()
-page = await browser.newPage()
-
-// Navigate the page to a URL
-response = await archive.loadPage(URL, page)
-
-if (response.status() < 300) {
-  // Save and log HTML
-  html = await archive.readPageHtml(page)
-  archive.logData({
-    data: html,
-    filename: `${BASENAME}.html`,
-    url: URL,
-    date: DATE,
-    type: 'page',
-    format: 'html',
-    status: response.status(),
-    headers: response.headers(),
-  })
-  // Save and log MHTML
-  mhtml = await archive.readPageMhtml(page)
-  archive.logData({
-    data: mhtml,
-    filename: `${BASENAME}.mhtml`,
-    url: URL,
-    date: DATE,
-    type: 'page',
-    format: 'mhtml',
-    status: response.status(),
-    headers: response.headers(),
-  })
-} else {
-  // Log error
-  archive.log({
-    url: URL,
-    date: DATE,
-    type: 'page',
-    status: response.status(),
-    headers: response.headers(),
-  })
-}
-
-// Close browser
-await browser.close()
-```
-
-## Archive a file
-
-See the functions in [`lib/workflow.js`](/lib/workflow.js).
-
-```js
-const workflow = require('./lib/workflow')
-
-// Download a remote file
-await workflow.downloadFile({
-  url: 'https://path/to/remote/file',
-})
-
-// Register an existing local file
-await workflow.registerFile({
-  file: '/path/to/local/file',
-  url: 'https://original/path/to/remote/file',
-  type: 'data'
-})
-```
README.md: 101 changes
@@ -5,8 +5,9 @@ Authors: Ethan Welty ([fallingfruit.org](https://fallingfruit.org)), Steve Benne…
 Scripts that fetch and process data about inventoried trees and other plants from government and university open data sources. The result is used, among other things, to populate [opentrees.org](https://opentrees.org).
 
 - [Installation](#installation)
+- [Overview](#overview)
 - [Usage](#usage)
-- [Target Schema](#target-schema)
+- [Target schema](#target-schema)
 - [Development](#development)
 
 ## Installation
@@ -17,24 +18,49 @@ cd opentrees-harvester
 yarn
 ```
 
+Copy `.env.example` to `.env` and set the environment variables as needed.
+
+```bash
+cp .env.example .env
+```
+
+- `DATA_ARCHIVE` (default `archive/data`): Directory of the data archive. See [Caching](#caching).
+- `DATA_ARCHIVE_LOG` (default `archive/data.jsonl`): Log file of the data archive. See [Caching](#caching).
+- `GEOCODE_ARCHIVE` (default `archive/geocode`): Directory of the geocode archive. Address geocode results are stored as JSON in files with paths of the form `{address_hash}.json`.
+- `GOOGLE_MAPS_API_KEY`: Google Maps API key for geocoding addresses.
+
+## Overview
+
+### Sources ([`sources.js`](sources.js))
+
+Each source dataset is described as a Javascript object (see [`SourceProperties`](API.md#module_types..SourceProperties)) in a single giant array sorted nominally by `country`, `state`, `city`, `designation`, and `scope`. A schema `crosswalk` strives to map the source dataset to our [target schema](#target-schema).
+
+### Downloading
+
+The harvester downloads source `data`, `metadata`, and `license` from URLs using the specified [`DownloadMethod`](API.md#module_types..DownloadMethod), which includes file-based download, querying the ArcGIS Feature Layer API, or rendering the URL in a web browser.
+
+### Caching
+
+The harvester aggressively caches source data and metadata in order to avoid re-downloading files, track changes of files over time, and protect against the inevitable link rot. Files are stored in the archive (`DATA_ARCHIVE` environment variable) with paths of the form `{hash}/{timestamp}/{filename}`, where `hash` is either the checksum of the source URL (if downloaded from a URL) or the checksum of the file content (if not). Archived files are described in the log file (`DATA_ARCHIVE_LOG` environment variable), a [JSON Lines](http://jsonlines.org) file that records file path, content checksum, date, and other file properties (see [`ArchiveEntry`](API.md#module_types..ArchiveEntry)).
+
+### Processing
+
+Downloading, caching, and finally data processing are typically executed via the [`Source`](API.md#module_source..Source) class, which wraps source properties and provides methods for each step in the pipeline. Processing includes reading the source data with [GDAL](https://gdal.org), applying the schema crosswalk, and writing the result to a new file.
+
 ## Usage
 
-### Source properties ([`sources/*.js`](sources))
-
-Each source dataset is described as a Javascript `object` following the format described at [`API.md#SourceProperties`](API.md#SourceProperties). They are sorted into modules organized by country. The schema crosswalks (`crosswalk` properties) strive to map each source dataset to our [target schema](#target-schema).
-
 ### Command line interface ([`cli/*.js`](cli))
 
-The command line interface provides a quick way to process all or a subset of the source datasets. See each command's help message:
+The command line interface provides a quick way to download and process all or a subset of the source datasets. See each command's help message:
 
 ```bash
-yarn get -h
-yarn process -h
+yarn get --help
+yarn process --help
 ```
 
 ### Source class ([`lib/source.js`](lib/source.js))
 
-The `Source` class wraps source properties to facilitate data processing. All methods are documented at [`API.md`](API.md#module_source..Source).
+The [`Source`](API.md#module_source..Source) class wraps source properties to facilitate data processing.
 
 Here is a simple example using the included [`tests/simple.csv`](tests/simple.csv):
@@ -44,7 +70,7 @@ const Source = require('./lib/source')
 const source = new Source(
   props = {
     id: 'test',
-    download: 'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
+    data: 'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
     geometry: { x: 'LON', y: 'LAT' },
     srs: 'EPSG:4326',
     crosswalk: {
@@ -52,26 +78,35 @@ const source = new Source(
       common: x => x['NAME'].toLowerCase(),
       height_cm: 'HEIGHT_CM'
     }
-  },
-  dir = 'test/input'
+  }
 )
 ```
 
-Use [`Source.get()`](API.md/#module_source..Source+get) to download remote files (`source.props.download`) to the source directory (`source.dir`) and prepare them for processing.
+Use [`Source.fetchFiles()`](API.md#module_source..Source+fetchFiles) to download the remote data file (`source.props.data`) to the archive.
 
 ```js
-source.get()
-// Promise { <pending> }
-// [test] Downloading simple.csv
-// [test] Downloaded simple.csv (0 MB)
-// [test] Ready to process
+await source.fetchFiles('data') // 'data' (default), 'metadata', or 'license'
+// [
+//   {
+//     date: 2024-09-24T20:41:22.507Z,
+//     url: 'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
+//     method: 'file',
+//     checksum: '7303b0bda0ca68c7db73922af340e4aa',
+//     path: 'archive/data/d60579b4f36793bb54f6f4790bd683a2/2024-09-24T204122.507Z/simple.csv.txt',
+//     props: { type: 'data' }
+//   }
+// ]
 ```
 
-Optionally, use [`Source.find()`](API.md/#module_source..Source+find) to check that we downloaded a file recognized by GDAL, then [`Source.getRows()`](API.md/#module_source..Source+getRows) (or `Source.getFields()`, `Source.sample()`, `Source.glimpse()`, etc) to read content from the file with GDAL.
+Optionally, use [`Source.findFiles()`](API.md#module_source..Source+findFiles) to retrieve them from the archive without downloading them. The output would be the same as above.
+
+We can now open the dataset with GDAL, then use one of the many methods to inspect it ([`Source.getRows()`](API.md#module_source..Source+getRows), [`Source.glimpse()`](API.md#module_source..Source+glimpse), etc).
 
 ```js
-source.find()
-// 'test/input/simple.csv'
+// Note: We need to set the GDAL driver explicitly because the data was downloaded as
+// '.csv.txt' instead of '.csv'.
+source.props.driver = 'CSV'
+await source.open()
 source.getRows(1)
 // [
 //   {
@@ -84,13 +119,12 @@ source.getRows(1)
 // ]
 ```
 
-Use [`Source.process()`](API.md/#module_source..Source+process) to process the input and write the result to a new file. In this case, this includes (1) writing a [VRT file](https://gdal.org/drivers/vector/vrt.html) to tell [GDAL](https://gdal.org) which spatial reference system and geometry field names to use when reading the input and (2) applying our schema crosswalk (`source.props.crosswalk`).
+Use [`Source.process()`](API.md/#module_source..Source+process) to process the input and write the result to a new file. In this case, this includes applying our schema crosswalk (`source.props.crosswalk`).
 
 ```js
-source.process('test/output/output.csv')
-// [test] Processing test/input/simple.csv
-// [test] Writing and reading VRT file
-// [test] Wrote output: test/output/output.csv
+await source.process('output/test.csv', { overwrite: true })
+// [test] Processing CSV:archive/data/d60579b4f36793bb54f6f4790bd683a2/2024-09-24T204122.507Z/simple.csv.txt
+// [test] Wrote output: output/test.csv
 ```
 
 We can modify the crosswalk following our conventions to apply unit conversions and other cleaning steps (see [`lib/convert.js`](lib/convert.js)). In this case, `height_cm` (in centimeters) is automatically converted to standard `height` (in meters).
@@ -98,17 +132,7 @@ We can modify the crosswalk following our conventions to apply unit conversions
 ```js
 const { modifyCrosswalk } = require('./lib/convert.js')
 source.props.crosswalk = modifyCrosswalk(source.props.crosswalk)
-source.process('test/output/output.csv', {overwrite: true})
-```
-
-Finally, the result can also be inspected using the `Source` class.
-
-```js
-const out = new Source({id: 'out'}, 'test/output')
-out.find()
-// 'test/output/output.csv'
-out.getRows(1)
-// [ { ref: '1', common: 'loquat', height: '12' } ]
+await source.process('output/test.csv', { overwrite: true })
 ```
 
 ### Scientific name matching
@@ -266,8 +290,9 @@ Numeric and date ranges use the field name suffixes `_min` and `_max`. For exam…
 ## Development
 
-The source code is documented using inline [JSDoc 3](https://jsdoc.app/) comments. Update the API documentation ([API.md](API.md)) from the source code by running:
+The source code is documented using inline [JSDoc 3](https://jsdoc.app) comments. Update the API documentation ([API.md](API.md)) from the source code by running:
 
 ```bash
+yarn test
 yarn docs
 ```
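For context on the caching layout described in the README's new Caching section, here is a minimal sketch of how an archive path of the form `{hash}/{timestamp}/{filename}` could be derived. The use of MD5 is an assumption based on the 32-character hexadecimal checksums shown in the README example; the harvester's actual hashing function may differ.

```js
const crypto = require('crypto')

// Hypothetical reconstruction of the archive path scheme described above:
// archive/data/{hash of source URL}/{timestamp}/{filename}
function archivePath(url, date, filename) {
  const hash = crypto.createHash('md5').update(url).digest('hex') // assumed MD5
  const timestamp = date.toISOString().replace(/:/g, '') // e.g. 2024-09-24T204122.507Z
  return `archive/data/${hash}/${timestamp}/${filename}`
}

console.log(archivePath(
  'https://raw.githubusercontent.com/ezwelty/opentrees-harvester/main/tests/simple.csv',
  new Date('2024-09-24T20:41:22.507Z'),
  'simple.csv.txt'
))
// e.g. archive/data/<32-hex-hash>/2024-09-24T204122.507Z/simple.csv.txt
```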
cli/common.js

@@ -1,20 +1,34 @@
 const DEFAULT_OPTIONS = [
   {
-    name: 'help', alias: 'h', type: Boolean, defaultValue: false
+    name: 'help', type: Boolean, defaultValue: false
   },
   {
-    name: 'ids', alias: 'i', type: String, multiple: true, defaultOption: true,
-    description: 'Restrict to these source identifiers.'
+    name: 'id', type: String, multiple: true,
+    description: 'Restrict by id.\nNote: These are currently assigned automatically on load based on source properties and are thus subject to change.'
   },
   {
-    name: 'countries', alias: 'c', type: String, multiple: true,
-    description: 'Restrict to these source countries (case and whitespace insensitive).'
+    name: 'country', type: String, multiple: true,
+    description: 'Restrict by country.'
   },
   {
-    name: 'dir', alias: 'd', type: String, defaultValue: 'data/${id}/input',
-    // Escape special characters for chalk. See https://github.com/Polymer/tools/pull/612
-    description: "Template for input directory, with source properties referred to by name (default: 'data/${id}/input').".
-      replace(/[{}\\]/g, '\\$&')
+    name: 'city', type: String, multiple: true,
+    description: 'Restrict by city.'
+  },
+  {
+    name: 'state', type: String, multiple: true,
+    description: 'Restrict by state.'
+  },
+  {
+    name: 'designation', type: String, multiple: true,
+    description: 'Restrict by designation.'
+  },
+  {
+    name: 'scope', type: String, multiple: true,
+    description: 'Restrict by scope.'
+  },
+  {
+    name: 'omit', type: Boolean, defaultValue: false,
+    description: 'Whether to keep sources flagged as `omit: true`.'
   }
 ]
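A minimal sketch of how these shared options are consumed, using the `command-line-args` package that `cli/get.js` requires below. The `argv` array is an illustrative invocation, not part of the commit:

```js
const commandLineArgs = require('command-line-args')
const { DEFAULT_OPTIONS } = require('./common')

// Parse a hypothetical invocation. With camelCase: true, an option like
// --keep-invalid (defined in cli/process.js) surfaces as options.keepInvalid.
const options = commandLineArgs(DEFAULT_OPTIONS, {
  camelCase: true,
  argv: ['--country', 'Australia', '--scope', 'tree', 'tree-street']
})
console.log(options.country) // ['Australia'] (multiple: true yields arrays)
console.log(options.scope)   // ['tree', 'tree-street']
console.log(options.omit)    // false (defaultValue)
```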
cli/get.js: 70 changes
@@ -2,20 +2,31 @@
 const commandLineUsage = require('command-line-usage')
 const commandLineArgs = require('command-line-args')
 const { DEFAULT_OPTIONS } = require('./common')
-const { loadSources } = require('../lib/load')
+const { loadSources } = require('../lib/sourceio')
 
 const OPTIONS = [
   ...DEFAULT_OPTIONS,
   {
-    name: 'force', alias: 'f', type: Boolean, defaultValue: false,
-    description: 'Overwrite input directory even if it is not empty.'
+    name: 'max-days', type: Number, defaultValue: null,
+    description: 'Maximum age (in days) of archived file (if older, re-download).'
+  },
+  {
+    name: 'type', type: String, defaultValue: 'data',
+    description: 'Type of file to download (data, metadata, license).'
+  },
+  {
+    name: 'format', type: String, defaultValue: 'pdf',
+    description: 'Format of browser download (pdf, png, mhtml, html).'
   }
 ]
 
 const USAGE = [
   {
-    header: 'example/get.js',
-    content: 'Download remote files, unpack compressed or archive files, and execute shell commands to prepare source files for processing.'
+    header: 'get.js',
+    content: (
+      'Download files if missing or older than a maximum age in the archive.\n' +
+      'Source filters are case and whitespace insensitive.'
+    )
   },
   {
     header: 'Options',
@@ -26,7 +37,7 @@ const USAGE = [
 // Parse command line arguments
 let options
 try {
-  options = commandLineArgs(OPTIONS)
+  options = commandLineArgs(OPTIONS, { camelCase: true })
   if (options.help) {
     console.log(commandLineUsage(USAGE))
     process.exit(0)
@@ -38,48 +49,53 @@ try {
 }
 
 // Load sources
-const sources = loadSources(
-  `${__dirname}/../sources`,
-  { ids: options.ids, countries: options.countries },
-  options.dir
-)
+const filters = {}
+const filterKeys = ['id', 'country', 'city', 'state', 'designation', 'scope', 'omit']
+filterKeys.forEach(key => filters[key] = options[key])
+const sources = loadSources(`${__dirname}/../sources.js`, filters)
 
 // Get sources
 const success = []
 const failure = []
 const skip = []
 async function getSource(source) {
+  const searchOptions = { format: options.format, maxDays: options.maxDays }
+  // Check if all files already exist
   try {
-    const paths = await source.get(options.force)
-    if (paths.length) {
-      success.push(source.props.id)
-    } else {
-      skip.push(source.props.id)
-    }
+    await source.findFiles(options.type, searchOptions)
+    skip.push(source.props.id)
+    return
   } catch (error) {
-    console.error(error.message)
-    failure.push(source.props.id)
+    // Download files
+    try {
+      await source.fetchFiles(options.type, searchOptions)
+      success.push(source.props.id)
+    } catch (error) {
+      console.error(error.message)
+      failure.push(source.props.id)
+    }
   }
 }
-async function get() {
+async function getSources() {
   console.log(`Found ${sources.length} sources`)
   await Promise.all(sources.map(source => getSource(source)))
   if (success.length) {
     console.log(
-      `${'[SUCCESS]'.green} Got ${success.length} sources:`,
-      success.join(', ')
+      `${'[SUCCESS]'.green} Downloaded ${success.length} sources:\n\n` +
+      success.join('\n')
     )
   }
   if (failure.length) {
     console.error(
-      `${'[ERROR]'.red} Failed to get ${failure.length} sources:`,
-      failure.join(', ')
+      `${'[ERROR]'.red} Failed to download ${failure.length} sources:\n\n` +
+      `${failure.join('\n')}`
     )
   }
   if (skip.length) {
     console.log(
-      `${'[SKIPPED]'.dim} Skipped ${skip.length} sources:`,
-      skip.join(', ')
+      `${'[SKIPPED]'.dim} ${skip.length} sources already in the archive (or empty):\n\n` +
+      skip.join('\n')
    )
   }
 }
-get()
+getSources()
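Example invocations of the rewritten downloader, based on the options defined above (the filter values are illustrative):

```bash
# Download data files for all Australian sources, re-downloading anything
# archived more than 30 days ago
yarn get --country Australia --max-days 30

# Fetch license pages, rendering them in a headless browser as PDF
yarn get --type license --format pdf
```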
cli/process.js: 110 changes
@@ -3,48 +3,62 @@ const colors = require('colors')
 const commandLineUsage = require('command-line-usage')
 const commandLineArgs = require('command-line-args')
 const { DEFAULT_OPTIONS } = require('./common')
-const { loadSources } = require('../lib/load')
 const { deleteFeature } = require('../lib/clean')
+const { loadSources } = require('../lib/sourceio')
 const { interpolateString } = require('../lib/helpers')
 
 const OPTIONS = [
   ...DEFAULT_OPTIONS,
   {
-    name: 'out', alias: 'o', type: String, defaultValue: 'data/${id}/output/output.csv',
+    name: 'file', type: String, defaultValue: 'output/${id}.csv',
     // Escape special characters for chalk. See https://github.com/Polymer/tools/pull/612
-    description: "Template for output file, with source properties referred to by name (default: 'data/${id}/output/output.csv').".
+    description: "Template for output file path, with source properties referred to by name (default: 'output/${id}.csv').".
       replace(/[{}\\]/g, '\\$&')
   },
+  {
+    name: 'driver', type: String,
+    description: (
+      'Name of GDAL driver to use for output (see https://gdal.org/drivers/vector). ' +
+      'Guessed from file extension if not provided.'
+    )
+  },
+  {
+    name: 'creation', type: String, multiple: true,
+    defaultValue: ['GEOMETRY=AS_WKT', 'STRING_QUOTING=IF_NEEDED'],
+    description: 'Driver-specific dataset creation options (see https://gdal.org/drivers/vector).'
+  },
+  {
+    name: 'overwrite', type: Boolean, defaultValue: false,
+    description: 'Overwrite output file even if it already exists.'
+  },
   {
     name: 'centroids', type: Boolean, defaultValue: false,
     description: 'Whether to reduce non-point geometries to centroids.'
   },
   {
-    name: 'keepInvalid', type: Boolean, defaultValue: false,
+    name: 'keep-invalid', type: Boolean, defaultValue: false,
     description: 'Whether to keep features with empty or invalid geometries.'
   },
   {
-    name: 'keepFields', type: Boolean, defaultValue: false,
+    name: 'keep-fields', type: Boolean, defaultValue: false,
     description: 'Whether to keep the input feature fields alongside the result of the schema crosswalk.'
   },
   {
     name: 'prefix', type: String, defaultValue: '',
-    description: 'String to append to input field names to prevent collisions with output field names. Applies only with `keepFields`.'
+    description: 'String to append to input field names to prevent collisions with output field names. Applies only with `keep-fields`.'
   },
   {
     name: 'bounds', type: Number, multiple: true,
     description: 'Bounding box in the format [xmin, ymin, xmax, ymax] (EPSG:4326). If provided, features outside the bounds are skipped.'
-  },
-  {
-    name: 'force', alias: 'f', type: Boolean, defaultValue: false,
-    description: 'Overwrite output file even if it already exists.'
   }
 ]
 
 const USAGE = [
   {
-    header: 'example/process.js',
-    content: 'Process sources and save as new vector files.',
+    header: 'process.js',
+    content: (
+      'Process sources and save as new vector files.\n' +
+      'Source filters are case and whitespace insensitive.'
+    )
   },
   {
     header: 'Options',
@@ -55,7 +69,7 @@ const USAGE = [
 // Parse command line arguments
 let options
 try {
-  options = commandLineArgs(OPTIONS)
+  options = commandLineArgs(OPTIONS, { camelCase: true })
   if (options.help) {
     console.log(commandLineUsage(USAGE))
     process.exit(0)
@@ -67,52 +81,60 @@ try {
 }
 
 // Load sources
-const sources = loadSources(
-  `${__dirname}/../sources`,
-  { ids: options.ids, countries: options.countries },
-  options.dir
-)
+const filters = {}
+const filterKeys = ['id', 'country', 'city', 'state', 'designation', 'scope', 'omit']
+filterKeys.forEach(key => filters[key] = options[key])
+const sources = loadSources(`${__dirname}/../sources.js`, filters)
 
 // Process sources
 const success = []
 const failure = []
 const skip = []
 const processOptions = {
-  overwrite: options.force,
+  driver: options.driver,
+  creation: options.creation,
+  overwrite: options.overwrite,
   centroids: options.centroids,
   keepInvalid: options.keepInvalid,
   keepFields: options.keepFields,
   prefix: options.prefix,
   bounds: options.bounds,
   deleteFunc: deleteFeature
 }
-sources.forEach(source => {
-  const file = interpolateString(options.out, source.props)
+async function processSource(source) {
+  const file = interpolateString(options.file, source.props)
   try {
-    const result = source.process(file, processOptions)
-    if (result) success.push(source.props.id)
-    else skip.push(source.props.id)
+    const result = await source.process(file, processOptions)
+    if (result) {
+      success.push(source.props.id)
+    } else {
+      skip.push(source.props.id)
+    }
   } catch (error) {
     console.error(error.message)
     failure.push(source.props.id)
   }
-})
-
-if (success.length) {
-  console.log(
-    `${'[SUCCESS]'.green} Processed ${success.length} sources:`,
-    success.join(', ')
-  )
-}
-if (failure.length) {
-  console.error(
-    `${'[ERROR]'.red} Failed to process ${failure.length} sources:`,
-    failure.join(', ')
-  )
-}
-if (skip.length) {
-  console.log(
-    `${'[SKIPPED]'.dim} Skipped ${skip.length} sources:`,
-    skip.join(', ')
-  )
 }
+
+async function processSources() {
+  for (const source of sources) {
+    await processSource(source)
+  }
+  if (success.length) {
+    console.log(
+      `${'[SUCCESS]'.green} Processed ${success.length} sources:\n\n` +
+      success.join('\n')
+    )
+  }
+  if (failure.length) {
+    console.error(
+      `${'[ERROR]'.red} Failed to process ${failure.length} sources:\n\n` +
+      failure.join('\n')
+    )
+  }
+  if (skip.length) {
+    console.log(
+      `${'[SKIPPED]'.dim} ${skip.length} sources already existed:\n\n` +
+      skip.join('\n')
+    )
+  }
+}
+processSources()
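Example invocations of the rewritten processor, based on the options defined above (the filter values and prefix are illustrative):

```bash
# Process every non-omitted source to output/${id}.csv, overwriting existing files
yarn process --overwrite

# Reduce polygons to centroids and keep the original input fields with a prefix
yarn process --scope tree --centroids --keep-fields --prefix src_
```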
@@ -1,4 +1,5 @@
+require('dotenv').config()
 const path = require('path')
 const fs = require('fs')
 const mime = require('mime-types')
 const puppeteer = require('puppeteer')
lib/load.js: 88 changes (file deleted)
@@ -1,88 +0,0 @@
-/**
- * Load the provided source datasets.
- *
- * @module
- */
-
-const { resolve } = require('path')
-const glob = require('glob')
-const colors = require('colors')
-const Source = require('./source')
-const { interpolateString, reduceString } = require('./helpers')
-const { modifyCrosswalk } = require('./convert')
-
-/**
- * Load sources from source properties.
- *
- * @param {string} path - Directory of JS files containing source properties.
- * @param {object} [filters={}]
- * @param {string[]} filters.ids - Return only sources with these identifiers.
- * @param {string[]} filters.countries - Return only source with these countries.
- * @param {string} [dir=data/${id}/input] - Source input directory (template
- *   interpolated on source properties).
- * @returns {Source[]}
- */
-function loadSources(path, filters = {}, dir = 'data/${id}/input') {
-  // Load source properties
-  const globOptions = { absolute: true, nodir: true, cwd: resolve(path) }
-  var sourceProps = glob.sync('**/*.js', globOptions).
-    map(file => require(file)).
-    flat()
-  // Ensure that source identifiers are unique
-  const all = sourceProps.map(props => props.id)
-  const duplicated = all.filter((item, index) => all.indexOf(item) != index)
-  if (duplicated.length) {
-    throw new Error(
-      `Duplicate source identifiers: ${[...new Set(duplicated)].join(', ')}`)
-  }
-  // Filter source properties
-  sourceProps = sourceProps.filter(props => {
-    return (
-      (
-        !filters.ids ||
-        filters.ids.map(x => reduceString(x)).includes(reduceString(props.id))
-      ) &&
-      (
-        !filters.countries ||
-        filters.countries.map(x => reduceString(x)).includes(reduceString(props.country))
-      )
-    )
-  })
-  const sources = []
-  const invalid = []
-  for (const props of sourceProps) {
-    let input
-    let source
-    try {
-      // Modify crosswalk for unit conversions and range parsing
-      props.crosswalk = modifyCrosswalk(props.crosswalk)
-      input = interpolateString(dir, props)
-    } catch (error) {
-      const tag = `[${props.id}]`.red
-      console.error(`${tag} ${error.message}`)
-      invalid.push(props.id)
-      continue
-    }
-    try {
-      // Convert to Source class
-      source = new Source(props, input)
-    } catch (error) {
-      console.error(error.message)
-      invalid.push(props.id)
-      continue
-    }
-    sources.push(source)
-  }
-  // Report failures
-  if (invalid.length) {
-    console.error(
-      `${'[ERROR]'.red} Skipped ${invalid.length} invalid source(s):`,
-      invalid.join(', ')
-    )
-  }
-  return sources
-}
-
-module.exports = {
-  loadSources
-}
@@ -1,3 +1,10 @@
+/**
+ * Match source column names to target schema based on existing crosswalks.
+ *
+ * @module
+ * @private
+ */
+
 const { parseFieldName } = require('./convert.js')
 
 /**
@@ -5,7 +12,7 @@ const { parseFieldName } = require('./convert.js')
  *
  * Assumes that the function accesses properties of an object named `x`.
  *
- * @param {*} func - Function whose source to extract field names from.
+ * @param {function} func - Function whose source to extract field names from.
  * @returns {string[]} Field names accessed by the function.
  * @private
  * @example
@@ -107,7 +114,7 @@ function matchFieldName(name, map) {
  *
  * @param {string[]} names - Source field names.
  * @param {Object.<string, string[]>} map - Target-source field name map.
- * @returns {Array[Object.<string, Object[]>, string[]]} Target field names mapped to
+ * @returns {Array} Target field names mapped to
  *   source field `name`, result `count`, and number of target field name `matches`
  *   (if greater than 1), followed by unmatched source field names.
 * @example
@@ -156,7 +163,7 @@ function guessCrosswalk(names, map) {
 /**
  * Print crosswalk guess to console.
  *
- * @param {Array[Object.<string, Object[]>, string[]]} guess - Crosswalk guess.
+ * @param {Array} guess - Crosswalk guess.
  * @example
  * guess = guessCrosswalk(['foz', 'foo', 'zzz'], {bar: ['foo', 'foo', 'foz'], baz: ['foo']})
 * printCrosswalkGuess(guess)
lib/source.js

@@ -12,7 +12,7 @@ const gdal = require('gdal-async')
 const { table } = require('table')
 const helpers = require('./helpers')
 const archive = require('./archive')
-const workflow = require('./workflow')
+const workflow = require('./archiveWorkflow')
 const LICENSES = require('./licenses')
 const {ArchiveFile, ArchiveEntry, SourceProperties, BrowserFormat, FileType, SourceFile} = require('./types')
@@ -584,55 +584,57 @@ class Source {
       }
       outputFeature.fields.set(outputFields)
       // Geometry
-      let inputGeometry = this.getFeatureGeometry(
-        inputFeature, { fields: inputFields, srs, isXY }
-      )
-      let isValid = false
-      if (inputGeometry) {
-        // Reduce to centroid if needed
-        if (options.centroids && inputGeometry.wkbType != gdal.wkbPoint) {
-          inputGeometry = inputGeometry.centroid()
-          // HACK: Centroid sometimes loses SRS
-          inputGeometry.srs = srs
-        }
-        // Check if geometry is valid
-        isValid = inputGeometry.isValid()
-        const isPoint = inputGeometry.wkbType == gdal.wkbPoint
-        if (isPoint) {
-          isValid = Boolean(
-            isValid &&
-            inputGeometry.x &&
-            inputGeometry.y &&
-            isFinite(inputGeometry.x) &&
-            isFinite(inputGeometry.y)
-          )
-        }
-        // Transform geometry
-        if (isValid && transform) {
-          try {
-            inputGeometry.transform(transform)
-          } catch (error) {
-            isValid = false
-          }
-        }
-        if (isValid && options.bounds) {
-          if (!inputGeometry.within(options.bounds)) {
-            counts.outOfBoundGeometries++
-            continue
-          }
-        }
-      }
-      if (!isValid) {
-        counts.invalidGeometries++
-        if (!options.keepInvalid) {
-          continue
-        }
-        inputGeometry = null
-      }
-      if (inputGeometry) {
-        // Flatten 3D geometries to 2D
-        inputGeometry.flattenTo2D()
-        outputFeature.setGeometry(inputGeometry)
-      }
+      if (outputLayer.geomType != gdal.wkbNone) {
+        let inputGeometry = this.getFeatureGeometry(
+          inputFeature, { fields: inputFields, srs, isXY }
+        )
+        let isValid = false
+        if (inputGeometry) {
+          // Reduce to centroid if needed
+          if (options.centroids && inputGeometry.wkbType != gdal.wkbPoint) {
+            inputGeometry = inputGeometry.centroid()
+            // HACK: Centroid sometimes loses SRS
+            inputGeometry.srs = srs
+          }
+          // Check if geometry is valid
+          isValid = inputGeometry.isValid()
+          const isPoint = inputGeometry.wkbType == gdal.wkbPoint
+          if (isPoint) {
+            isValid = Boolean(
+              isValid &&
+              inputGeometry.x &&
+              inputGeometry.y &&
+              isFinite(inputGeometry.x) &&
+              isFinite(inputGeometry.y)
+            )
+          }
+          // Transform geometry
+          if (isValid && transform) {
+            try {
+              inputGeometry.transform(transform)
+            } catch (error) {
+              isValid = false
+            }
+          }
+          if (isValid && options.bounds) {
+            if (!inputGeometry.within(options.bounds)) {
+              counts.outOfBoundGeometries++
+              continue
+            }
+          }
+        }
+        if (!isValid) {
+          counts.invalidGeometries++
+          if (!options.keepInvalid) {
+            continue
+          }
+          inputGeometry = null
+        }
+        if (inputGeometry) {
+          // Flatten 3D geometries to 2D
+          inputGeometry.flattenTo2D()
+          outputFeature.setGeometry(inputGeometry)
+        }
+      }
       outputLayer.features.add(outputFeature)
     }
lib/sourceio.js: 149 changes
@@ -1,12 +1,15 @@
 /**
- * Read and write source properties.
+ * Read and write sources and source properties.
  *
  * @module
  */
 const path = require('path')
 const util = require('util')
 const fs = require('fs')
 
 const colors = require('colors')
+const Source = require('./source')
+const { reduceString } = require('./helpers')
+const { modifyCrosswalk } = require('./convert')
 const {SourceProperties} = require('./types')
@@ -24,18 +27,18 @@ function readSourceProperties(file) {
 /**
  * Write source properties to a file.
  *
- * @param {SourceProperties[]} sources - Source properties.
+ * @param {SourceProperties[]} sourceProps - Source properties.
  * @param {string} file - Path to new source properties file.
  * @param {string} currentFile - Path to current source properties file (
  *   defaults to `file`). Used to replicate the header
  *   (everything before `module.exports`).
  */
-function writeSourceProperties(sources, file, currentFile) {
+function writeSourceProperties(sourceProps, file, currentFile) {
   if (!currentFile) {
     currentFile = file
   }
-  // const copies = structuredClone(sources)
-  const copies = sources
+  // const copies = structuredClone(sourceProps)
+  const copies = sourceProps
   // Define custom inspection for functions
   function inspectFunction() {
     const txt = this.toString()
@@ -77,7 +80,139 @@ function writeSourceProperties(sourceProps, file, currentFile) {
   fs.writeFileSync(file, `${headerMatch[1]}module.exports = ${txt}\n`)
 }
 
+/**
+ * Build unique source ids from source properties.
+ *
+ * @param {SourceProperties[]} sourceProps - Source properties.
+ * @returns {string[]} Source ids.
+ * @private
+ */
+function buildSourceIds(sourceProps) {
+  const ids = sourceProps.map(props =>
+    [props.country, props.state, props.city, props.designation]
+      .filter(Boolean).join(' > ')
+  )
+  // Append scope to duplicate ids if scope not same for all duplicates
+  let duplicates = ids.filter(id => ids.filter(x => x == id).length > 1)
+  duplicates.forEach(duplicate => {
+    // Get scopes for id
+    let scopes = []
+    ids.forEach((id, index) => {
+      if (id == duplicate) {
+        scopes.push(sourceProps[index].scope)
+      }
+    })
+    // If scopes not all equal, append scope to id
+    if (scopes.some(scope => scope != scopes[0])) {
+      ids.forEach((id, index) => {
+        if (id == duplicate) {
+          ids[index] = `${id} > ${sourceProps[index].scope}`
+        }
+      })
+    }
+  })
+  // Append integer to duplicate ids
+  duplicates = ids.filter(id => ids.filter(x => x == id).length > 1)
+  duplicates.forEach(duplicate => {
+    let counter = 1
+    ids.forEach((id, index) => {
+      if (id == duplicate) {
+        ids[index] = `${id} > ${counter++}`
+      }
+    })
+  })
+  return ids
+}
+
+/**
+ * Load sources from source properties.
+ *
+ * Crosswalks are modified for unit conversions and range parsing.
+ *
+ * @param {string} file - Path to file containing source properties.
+ * @param {object} [filters={}]
+ * @param {string[]} filters.id - Filter by id.
+ * @param {string[]} filters.country - Filter by country.
+ * @param {string[]} filters.state - Filter by state.
+ * @param {string[]} filters.city - Filter by city.
+ * @param {string[]} filters.designation - Filter by designation.
+ * @param {string[]} filters.scope - Filter by scope.
+ * @param {boolean} filters.omit - Whether to include sources flagged as `omit: true`.
+ * @returns {Source[]}
+ */
+function loadSources(file, filters = {}) {
+  // Load source properties
+  var sourceProps = readSourceProperties(file)
+  // Assign unique ids
+  const ids = buildSourceIds(sourceProps)
+  sourceProps.forEach((props, index) => {
+    props.id = ids[index]
+  })
+  // Filter source properties
+  if (filters.omit !== true) {
+    sourceProps = sourceProps.filter(props => !props.omit)
+  }
+  const filteredProps = []
+  for (const props of sourceProps) {
+    var include = true
+    for (const key in filters) {
+      if (key == 'omit') {
+        continue
+      }
+      const filter = filters[key]
+      if (!filter || !filter.length) {
+        continue
+      }
+      const value = props[key] ? reduceString(props[key]) : null
+      if (!filter.map(reduceString).includes(value)) {
+        include = false
+        break
+      }
+    }
+    if (include) {
+      filteredProps.push(props)
+    }
+  }
+  // Load sources
+  const sources = []
+  const invalid = []
+  for (const props of filteredProps) {
+    let source
+    if (props.crosswalk) {
+      try {
+        // Modify crosswalk for unit conversions and range parsing
+        props.crosswalk = modifyCrosswalk(props.crosswalk)
+      } catch (error) {
+        const tag = `[${props.id}]`.red
+        console.error(`${tag} ${error.message}`)
+        invalid.push(props.id)
+        continue
+      }
+    }
+    try {
+      // Convert to Source class
+      source = new Source(props)
+    } catch (error) {
+      console.error(error.message)
+      invalid.push(props.id)
+      continue
+    }
+    sources.push(source)
+  }
+  // Report failures
+  if (invalid.length) {
+    console.error(
+      `${'[ERROR]'.red} Skipped ${invalid.length} invalid source(s):\n\n` +
+      invalid.join('\n')
+    )
+  }
+  return sources
+}
+
 module.exports = {
   readSourceProperties,
-  writeSourceProperties
+  writeSourceProperties,
+  loadSources,
+  buildSourceIds
 }
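A worked example of the id-building rules added above (the input properties are hypothetical): ids join `country > state > city > designation`, scope is appended only when it disambiguates duplicates, and an integer suffix is the fallback.

```js
const { buildSourceIds } = require('./lib/sourceio')

const props = [
  { country: 'Australia', city: 'Melbourne', scope: 'tree' },
  { country: 'Australia', city: 'Melbourne', scope: 'tree-park' },
  { country: 'Australia', city: 'Sydney', scope: 'tree' },
  { country: 'Australia', city: 'Sydney', scope: 'tree' },
]
console.log(buildSourceIds(props))
// [
//   'Australia > Melbourne > tree',      // duplicates with differing scopes:
//   'Australia > Melbourne > tree-park', // scope appended
//   'Australia > Sydney > 1',            // duplicates with identical scopes:
//   'Australia > Sydney > 2'             // integer appended
// ]
```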
@@ -1,3 +1,4 @@
 const fs = require('fs')
 const gdal = require('gdal-async')
+const helpers = require('./lib/helpers')
 