require('dotenv').config()
const path = require('path')
const fs = require('fs')
const mime = require('mime-types')
const puppeteer = require('puppeteer')
const crypto = require('crypto')
const axios = require('axios')
const util = require('util')
const stream = require('stream')
const streamPipeline = util.promisify(stream.pipeline)

const helpers = require('./helpers')
const {ArchiveEntry} = require('./types')

/**
 * Download file and compute MD5 hash of the stream.
 *
 * @param {string} url - URL to download
 * @param {string} dir - Directory to save file to
 * @returns {Promise<object>} File path (file) and MD5 hash of the
 * base64-encoded content (checksum)
 */
async function downloadFile(url, dir = '.') {
  try {
    const response = await axios.get(url, {responseType: 'stream'})
    const filename = guessFilename({headers: response.headers, url: url})
    const file = path.join(dir, filename)
    fs.mkdirSync(dir, { recursive: true })
    const hasher = crypto.createHash('md5')
    const base64 = new stream.PassThrough({encoding: 'base64'})
    // Stream response to disk and, in parallel, through the base64 encoder
    // into the hasher
    await Promise.all([
      streamPipeline(response.data, fs.createWriteStream(file)),
      streamPipeline(response.data, base64),
      streamPipeline(base64, hasher)
    ])
    const checksum = hasher.digest('hex')
    const megabytes = (fs.statSync(file).size / (1024**2)).toFixed(3)
    console.log(`Downloaded ${url} to ${file} (${megabytes} MB)`)
    return {file, checksum}
  } catch (error) {
    throw new Error(`Download of ${url} failed: ${error.message}`)
  }
}
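
// Usage sketch (hypothetical URL and directory):
//   const {file, checksum} = await downloadFile('https://example.com/data.zip', 'downloads')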

/**
 * Load URL in browser page.
 *
 * @param {string} url
 * @param {puppeteer.Page} page
 * @returns {Promise<puppeteer.HTTPResponse>}
 */
async function loadPage(url, page) {
  return await page.goto(url, { waitUntil: 'networkidle0', timeout: 0 })
}

/**
 * Read browser page as HTML.
 *
 * @param {puppeteer.Page} page
 * @returns {Promise<string>} HTML
 */
async function readPageHtml(page) {
  return await page.content()
}

/**
 * Read browser page as MHTML.
 *
 * @param {puppeteer.Page} page
 * @returns {Promise<string>} MHTML
 */
async function readPageMhtml(page) {
  const cdp = await page.target().createCDPSession()
  const result = await cdp.send('Page.captureSnapshot', { format: 'mhtml' })
  return result.data
}

/**
 * Read browser page as PNG.
 *
 * @param {puppeteer.Page} page
 * @returns {Promise<Buffer>} PNG
 */
async function readPagePng(page) {
  return await page.screenshot({ fullPage: true })
}

/**
 * Read browser page as PDF.
 *
 * @param {puppeteer.Page} page
 * @returns {Promise<Buffer>} PDF
 */
async function readPagePdf(page) {
  await page.emulateMediaType('screen')
  const width = await page.evaluate(
    () => document.documentElement.scrollWidth
  )
  const height = await page.evaluate(
    () => document.documentElement.scrollHeight
  )
  return await page.pdf({
    printBackground: true,
    // HACK: Add 10% to height to prevent content from bleeding into second page
    height: `${Math.ceil(height * 1.10)}px`,
    width: `${width}px`,
    pageRanges: '1'
  })
}
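
// Usage sketch for the page helpers (assumes a standard puppeteer launch):
//   const browser = await puppeteer.launch()
//   const page = await browser.newPage()
//   await loadPage('https://example.com', page)
//   const html = await readPageHtml(page)
//   const pdf = await readPagePdf(page)
//   await browser.close()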

/**
 * Compute MD5 hash.
 *
 * @param {string} x
 * @returns {string} MD5 hash
 */
function md5(x) {
  return crypto.createHash('md5').update(x).digest('hex')
}

/**
 * Compute MD5 hash of a file read as a stream.
 *
 * Uses base64 encoding by default as it was found to be much faster for large
 * binary files and the same as UTF-8 for text.
 *
 * @param {string} file
 * @param {object} [options] - Options passed to fs.createReadStream
 * @returns {Promise<string>} MD5 hash
 */
async function hashFile(file, options = {encoding: 'base64'}) {
  const hash = crypto.createHash('md5')
  const stream = fs.createReadStream(file, options)
  for await (const chunk of stream) {
    hash.update(chunk.toString())
  }
  return hash.digest('hex')
}
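
// Usage sketch (hypothetical file path):
//   const checksum = await hashFile('downloads/data.zip')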

/**
 * Build archive path.
 *
 * @param {object} params
 * @param {string} [params.url] - URL, represented by its MD5 hash
 * @param {string} [params.checksum] - File hash, used as the hash if no url
 * @param {Date} [params.date] - Date, represented as an ISO 8601 string with
 * colons (:) removed
 * @returns {string}
 */
function buildPath({url, checksum, date = new Date()} = {}) {
  const hash = url ? md5(url) : checksum
  date = date.toISOString().replace(/:/g, '')
  return path.join(process.env.DATA_ARCHIVE, hash, date)
}
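
// For example, assuming DATA_ARCHIVE='archive':
//   buildPath({url: 'https://example.com', date: new Date('2020-01-01')})
//   // => 'archive/<md5 of url>/2020-01-01T000000.000Z'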

/**
 * Add an entry to the archive log.
 *
 * @param {object} params
 * @param {Date} [params.date]
 * @param {string} [params.path] - Path to a file in the archive
 * @param {string} [params.checksum] - File checksum (computed from params.path
 * if not provided)
 * @returns {Promise<ArchiveEntry>}
 */
async function log({ date = new Date(), ...props } = {}) {
  let entry = { date, ...props }
  if (entry.path) {
    // Check that the path is in the archive
    const parent = path.resolve(process.env.DATA_ARCHIVE)
    if (!path.resolve(entry.path).startsWith(parent)) {
      throw new Error(`Path ${entry.path} is not in ${process.env.DATA_ARCHIVE}`)
    }
  }
  if (entry.path && !entry.checksum) {
    entry.checksum = await hashFile(entry.path)
  }
  // Make path relative to archive
  if (entry.path) {
    entry.path = path.relative(process.env.DATA_ARCHIVE, entry.path)
  }
  if (entry.checksum) {
    const entries = await search({ checksum: entry.checksum })
    if (entries.length > 0) {
      // Reuse path of most recent entry with same checksum
      const existingEntry = entries.sort(
        (a, b) => new Date(b.date) - new Date(a.date)
      )[0]
      entry.path = existingEntry.path
    }
  }
  fs.appendFileSync(process.env.DATA_ARCHIVE_LOG, JSON.stringify(entry) + '\n')
  // Expand path to full path
  if (entry.path) {
    entry.path = path.join(process.env.DATA_ARCHIVE, entry.path)
  }
  return entry
}
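
// Usage sketch (hypothetical values; path must lie inside DATA_ARCHIVE):
//   const entry = await log({
//     url: 'https://example.com/data.zip',
//     path: file  // e.g. as returned by downloadFile() into the archive
//   })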

/**
 * Guess filename from HTTP response headers and URL.
 *
 * @param {object} params
 * @param {object} [params.headers] - HTTP response headers
 * @param {string} [params.defaultBasename] - Basename to use if none is found
 * @param {string} [params.url] - HTTP request URL
 * @returns {string}
 */
function guessFilename({headers = {}, defaultBasename = 'response', url = null} = {}) {
  const basenames = []
  if (headers['content-disposition']) {
    const match = headers['content-disposition'].match(
      /filename[^;=\n]*=\s*(UTF-\d['"]*)?((['"]).*?\3|[^;\n]*)?/i
    )
    if (match && match[2]) {
      // Remove begin or end quotes
      basenames.push(match[2].replace(/^['"]|['"]$/g, ''))
    }
  }
  if (url) {
    const u = new URL(url)
    const filename = path.basename(u.pathname)
    if (path.extname(filename)) {
      basenames.push(filename)
    }
  }
  basenames.push(defaultBasename)
  const ADDITIONAL_MIME_TYPES = {
    // https://mimetype.io/application/x-zip-compressed
    'application/zip-compressed': 'zip',
    'application/x-zip-compressed': 'zip',
    'multipart/x-zip': 'zip'
  }
  const extension = (
    mime.extension(headers['content-type']) ||
    ADDITIONAL_MIME_TYPES[headers['content-type']]
  )
  const basename = basenames.filter(x => x)[0].replace(/\.$/, '')
  if (extension && path.extname(basename) !== `.${extension}`) {
    return `${basename}.${extension}`
  }
  return basename
}
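
// For example:
//   guessFilename({
//     headers: {
//       'content-disposition': 'attachment; filename="trees.csv"',
//       'content-type': 'text/csv'
//     }
//   })
//   // => 'trees.csv'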

/**
 * Write data to file and add to log.
 *
 * @param {object} params
 * @param {string} params.data
 * @param {string} params.filename
 * @param {string} params.url
 * @param {Date} [params.date]
 * @returns {Promise<ArchiveEntry>}
 */
function logData({ data, filename, url, date = new Date(), ...props } = {}) {
  const dir = buildPath({ url, date })
  const file = path.join(dir, filename)
  // Write file
  fs.mkdirSync(dir, { recursive: true })
  fs.writeFileSync(file, data)
  // Log file
  return log({ url, path: file, date, ...props })
}
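
// Usage sketch (hypothetical values; html as returned by readPageHtml):
//   const entry = await logData({data: html, filename: 'response.html', url: 'https://example.com'})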

/**
 * Search log for matching entries.
 *
 * @param {object} params - Search criteria as key-value pairs that must match
 * @param {object} [options]
 * @param {number} [options.limit] - Maximum number of results to return
 * @param {number} [options.maxDays] - Maximum age of result in days
 * @returns {ArchiveEntry[]} Entries that match search criteria, sorted by date
 * descending.
 */
function search(params, {limit, maxDays} = {}) {
  let maxDate
  if (typeof maxDays === 'number' && isFinite(maxDays)) {
    maxDate = new Date()
    maxDate.setSeconds(maxDate.getSeconds() - maxDays * 24 * 3600)
  }
  const criteria = Object.entries(params || {})
  const entries = helpers.readJSONL(process.env.DATA_ARCHIVE_LOG)
  // Filter entries
  const selected = []
  for (const entry of entries) {
    if (limit && selected.length === limit) {
      break
    }
    if (criteria.map(([key, value]) => entry[key] === value).every(Boolean)) {
      if (!maxDate || new Date(entry.date) > maxDate) {
        selected.push(entry)
      }
    }
  }
  // Sort entries by date descending
  selected.sort((a, b) => new Date(b.date) - new Date(a.date))
  // Convert paths to full paths
  selected.forEach(entry => {
    entry.path = path.join(process.env.DATA_ARCHIVE, entry.path)
  })
  return selected
}
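
// Usage sketch (hypothetical URL):
//   const entries = search({url: 'https://example.com/data.zip'}, {limit: 1, maxDays: 30})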

module.exports = {
  md5,
  loadPage,
  readPageHtml,
  readPageMhtml,
  readPagePng,
  readPagePdf,
  buildPath,
  guessFilename,
  log,
  logData,
  search,
  hashFile,
  downloadFile
}