/**
 * Data archive workflows.
 *
 * @module
 * @private
 */
const axios = require('axios')
const archive = require('./archive')
const helpers = require('./helpers')
const wfs = require('./wfs')
const fs = require('fs')
const os = require('os')
const puppeteer = require('puppeteer')
const {ArchiveEntry, BrowserFormat} = require('./types')
const path = require('path')

let BROWSER = null

const PAGE_FORMAT_FUNCTIONS = {
  mhtml: archive.readPageMhtml,
  html: archive.readPageHtml,
  png: archive.readPagePng,
  pdf: archive.readPagePdf,
}

/**
 * Get cached browser instance.
 *
 * @returns {Promise<puppeteer.Browser>}
 */
async function getBrowser() {
  // Launch the browser on first use and cache it so later calls reuse it
  if (!BROWSER) {
    BROWSER = await puppeteer.launch({
      headless: true,
      defaultViewport: { width: 1440, height: 900 }
    })
  }
  return BROWSER
}

/**
 * Download file from URL and log result.
 *
 * @param {object} params - Parameters
 * @param {string} params.url - URL to download
 * @param {number} params.maxDays - Maximum age of existing result in days that
 * would prevent downloading again
 * @param {object} [params.props] - Additional properties to log
 * @returns {Promise<ArchiveEntry>} Log entry
 */
async function downloadFile({url, maxDays, props} = {}) {
  // Check that we have not recently downloaded this URL
  if (maxDays !== 0) {
    const existing = archive.search({url}, {maxDays})
    if (existing.length > 0) {
      console.log(`[${url}] Already downloaded`)
      return existing[0]
    }
  }
  // Download file to temporary directory
  const date = new Date()
  const tempdir = fs.mkdtempSync(`${os.tmpdir()}${path.sep}`)
  const {file, checksum} = await archive.downloadFile(url, tempdir)
  // if (!file || !fs.existsSync(file)) {
  //   console.error(`[${url}] Download failed to ${file}`)
  //   return false
  // }
  // Compare checksum
  const entry = { url, method: 'file', date, checksum }
  const match = archive.search({ checksum })[0]
  if (match) {
    // Reuse path of existing file
    entry.path = match.path
    entry.existed = true
    // Remove temporary directory (and the duplicate download inside it)
    fs.rmSync(tempdir, { recursive: true })
  } else {
    // Move file to archive path
    const dir = archive.buildPath({url, date})
    fs.mkdirSync(dir, { recursive: true })
    entry.path = path.join(dir, path.basename(file))
    fs.renameSync(file, entry.path)
    // Remove the now-empty temporary directory
    fs.rmSync(tempdir, { recursive: true })
  }
  // Log file
  if (props && Object.keys(props).length > 0) {
    entry.props = props
  }
  return archive.log(entry)
}
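/*
 * Example usage (a hypothetical sketch: the URL is illustrative, the module
 * is assumed to be saved as `workflows.js`, and the archive root is whatever
 * `./archive` is configured to use):
 *
 *   const {downloadFile} = require('./workflows')
 *
 *   // Skip the download if the same URL was archived in the last 7 days
 *   downloadFile({
 *     url: 'https://example.com/data.csv',
 *     maxDays: 7,
 *     props: {source: 'example'}
 *   }).then(entry => console.log(entry))
 */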
/**
 * Download features from ArcGIS Feature Layer and log result.
 *
 * @param {object} params - Parameters
 * @param {string} params.url - Feature layer URL
 * @param {number} params.maxDays - Maximum age of existing result in days that
 * would prevent downloading again
 * @param {object} [params.props] - Additional properties to log
 * @returns {Promise<ArchiveEntry>} Log entry
 */
async function downloadArcgisFeatureLayer({url, maxDays, props} = {}) {
  // Check that we have not recently downloaded this URL
  if (maxDays !== 0) {
    const existing = archive.search({url}, {maxDays})
    if (existing.length > 0) {
      console.log(`[${url}] Already downloaded`)
      return existing[0]
    }
  }
  // Get features
  const date = new Date()
  const result = await helpers.getLayerFeatures(url)
  const txt = JSON.stringify(result)
  const checksum = archive.md5(Buffer.from(txt).toString('base64'))
  const entry = { url, method: 'arcgis', date, checksum }
  // Compare checksum
  const match = archive.search({ checksum })[0]
  if (match) {
    // Reuse path of most recent entry with matching checksum
    entry.path = match.path
    entry.existed = true
  } else {
    // Write file to archive
    const dir = archive.buildPath({url, date})
    fs.mkdirSync(dir, { recursive: true })
    entry.path = path.join(dir, 'features.json')
    fs.writeFileSync(entry.path, txt, {encoding: 'utf-8'})
    const megabytes = (Buffer.byteLength(txt, 'utf8') / (1024 ** 2)).toFixed(3)
    console.log(`[${url}] Written to ${entry.path} (${megabytes} MB)`)
  }
  // Log file
  if (props && Object.keys(props).length > 0) {
    entry.props = props
  }
  return archive.log(entry)
}

/**
 * Register existing file in archive log.
 *
 * @param {object} params - Parameters
 * @param {string} params.file - Path to file
 * @param {Date} [params.date] - Date of file (defaults to file creation time)
 * @param {string} [params.url] - URL of original file download
 * @param {string} [params.method] - Method used to download file from URL
 * @param {number} params.maxDays - Maximum age of existing result in days that
 * would prevent downloading again
 * @param {object} [params.props] - Additional properties to log
 * @returns {Promise<ArchiveEntry>} Log entry
 */
async function registerFile({file, date, url, method, maxDays, props} = {}) {
  // Check that we have not recently downloaded this URL
  if (url && maxDays !== 0) {
    const existing = archive.search({url}, {maxDays})
    if (existing.length > 0) {
      console.log(`[${url}] Already downloaded`)
      return existing[0]
    }
  }
  // Get date from file creation time
  if (!date) {
    const stats = fs.statSync(file)
    date = new Date(stats.ctime)
  }
  // Compare checksum
  const checksum = await archive.hashFile(file)
  const entry = {
    ...url && {url},
    ...method && {method},
    date,
    dateAdded: new Date(),
    checksum
  }
  const match = archive.search({ checksum })[0]
  if (match) {
    console.log(`[${file}] Already exists`)
    return match
  }
  // Move file to archive path
  const dir = archive.buildPath({ url, checksum, date })
  fs.mkdirSync(dir, { recursive: true })
  entry.path = path.join(dir, path.basename(file))
  fs.renameSync(file, entry.path)
  // Log file
  if (props && Object.keys(props).length > 0) {
    entry.props = props
  }
  return archive.log(entry)
}

/**
 * Build WFS GetFeature URL.
 *
 * @param {string} url - WFS server URL (ideally with typeName parameter)
 * @returns {Promise<object>} URL (url) and server capabilities (capabilities)
 */
async function buildWfsDownloadUrl(url) {
  const capabilitiesURL = wfs.buildGetCapabilitiesUrl(url)
  const response = await axios.get(capabilitiesURL)
  const capabilities = wfs.parseCapabilities(response.data)
  const featureURL = wfs.buildGetFeatureUrl(url, capabilities)
  return {url: featureURL, capabilities}
}
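/*
 * Example usage (a hypothetical sketch: the WFS endpoint and typeName are
 * illustrative, and the module is assumed to be saved as `workflows.js`):
 *
 *   const {buildWfsDownloadUrl, downloadFile} = require('./workflows')
 *
 *   // Resolve a GetFeature URL from the server capabilities, then archive it
 *   buildWfsDownloadUrl('https://example.com/geoserver/wfs?typeName=ns:trees')
 *     .then(({url}) => downloadFile({url, maxDays: 30}))
 */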
/**
 * Download web page and log result.
 *
 * Page is rendered in a headless browser (puppeteer) and saved in the
 * requested format (see PAGE_FORMAT_FUNCTIONS).
 *
 * @param {object} params - Parameters
 * @param {string} params.url - URL to download
 * @param {BrowserFormat} params.format - Format to save page as
 * @param {number} params.maxDays - Maximum age of existing result in days that
 * would prevent downloading again
 * @param {object} [params.props] - Additional properties to log
 * @returns {Promise<ArchiveEntry>} Log entry
 */
async function downloadPage({url, format, maxDays, props} = {}) {
  const readFunction = PAGE_FORMAT_FUNCTIONS[format]
  if (!readFunction) {
    throw new Error(`Unknown format: ${format}`)
  }
  // Check that we have not recently downloaded this URL
  if (maxDays !== 0) {
    const existing = archive.search({url, format}, {maxDays})
    if (existing.length > 0) {
      return existing[0]
    }
  }
  // Load page in browser
  const browser = await getBrowser()
  const page = await browser.newPage()
  // Set user agent to avoid bot detection
  // https://stackoverflow.com/a/55684016
  await page.setUserAgent(
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Brave Chrome/84.0.4147.89 Safari/537.36'
  )
  const date = new Date()
  const response = await archive.loadPage(url, page)
  const status = response.status()
  if (status >= 300 && status !== 304) {
    await page.close()
    throw new Error(`[${url}] Failed to open: HTTP ${status}`)
  }
  // Save page
  const data = await readFunction(page)
  const isBuffer = Buffer.isBuffer(data)
  await page.close()
  // Compare checksum
  const buffer = isBuffer ? data : Buffer.from(data)
  const checksum = archive.md5(buffer.toString('base64'))
  const entry = { url, method: 'browser', format, date, checksum }
  const matching = archive.search({ checksum: entry.checksum })
  if (matching.length > 0) {
    // Reuse path of most recent entry with matching checksum
    entry.path = matching[0].path
    entry.existed = true
  } else {
    // Write file to archive
    const dir = archive.buildPath({url, date})
    fs.mkdirSync(dir, { recursive: true })
    entry.path = path.join(dir, `response.${format}`)
    fs.writeFileSync(entry.path, data, isBuffer ? {} : {encoding: 'utf-8'})
    const megabytes = (buffer.byteLength / (1024 ** 2)).toFixed(3)
    console.log(`[${url}] Written to ${entry.path} (${megabytes} MB)`)
  }
  // Log file
  if (props && Object.keys(props).length > 0) {
    entry.props = props
  }
  return archive.log(entry)
}

module.exports = {
  getBrowser,
  downloadFile,
  downloadArcgisFeatureLayer,
  registerFile,
  buildWfsDownloadUrl,
  downloadPage,
}
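/*
 * Example usage (a hypothetical sketch; the URL is illustrative): render a
 * page as MHTML, archive it, then close the shared browser so Node can exit.
 *
 *   const workflows = require('./workflows')
 *
 *   async function main() {
 *     const entry = await workflows.downloadPage({
 *       url: 'https://example.com',
 *       format: 'mhtml',
 *       maxDays: 1
 *     })
 *     console.log(entry.path)
 *     // getBrowser() returns the cached instance used by downloadPage()
 *     const browser = await workflows.getBrowser()
 *     await browser.close()
 *   }
 *
 *   main()
 */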