// opentrees-harvester/lib/archive.js

require('dotenv').config()
const path = require('path')
const fs = require('fs')
const mime = require('mime-types')
const puppeteer = require('puppeteer')
const crypto = require('crypto')
const axios = require('axios')
const util = require('util')
const stream = require('stream')
const streamPipeline = util.promisify(stream.pipeline)
const helpers = require('./helpers')
const {ArchiveEntry} = require('./types')
/**
* Download file and compute MD5 hash of the stream.
*
* @param {string} url URL to download
* @param {string} dir Directory to save file to
* @returns {Promise<{file: string, checksum: string}>} File path (file) and MD5 checksum of the base64-encoded content (checksum)
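* @example
* // Illustrative only: the URL and destination directory are placeholders
* const {file, checksum} = await downloadFile('https://example.com/trees.csv', 'downloads')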
*/
async function downloadFile(url, dir = '.') {
try {
const response = await axios.get(url, {responseType: 'stream'})
const filename = guessFilename({headers: response.headers, url: url})
const file = path.join(dir, filename)
fs.mkdirSync(dir, { recursive: true })
const hasher = crypto.createHash('md5')
const base64 = new stream.PassThrough({encoding: 'base64'})
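// Stream the response both to disk and (as base64) through the hasher,
// so the checksum matches the base64-based hashing used by hashFile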
await Promise.all([
streamPipeline(response.data, fs.createWriteStream(file)),
streamPipeline(response.data, base64),
streamPipeline(base64, hasher)
])
const checksum = hasher.digest('hex')
const megabytes = (fs.statSync(file).size / (1024**2)).toFixed(3)
console.log(`Downloaded ${url} to ${file} (${megabytes} MB)`)
return {file, checksum}
} catch (error) {
throw new Error(`Download of ${url} failed: ${error.message}`)
}
}
/**
* Load URL in browser page.
*
* @param {string} url
* @param {puppeteer.Page} page
* @returns {Promise<puppeteer.HTTPResponse>}
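* @example
* // Sketch of a typical capture flow (the URL is a placeholder):
* const browser = await puppeteer.launch()
* const page = await browser.newPage()
* await loadPage('https://example.com', page)
* const mhtml = await readPageMhtml(page)
* await browser.close()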
*/
async function loadPage(url, page) {
return await page.goto(url, { waitUntil: 'networkidle0', timeout: 0 })
}
/**
* Read browser page as HTML.
*
* @param {puppeteer.Page} page
* @returns {Promise<string>} HTML
*/
async function readPageHtml(page) {
return await page.content()
}
/**
* Read browser page as MHTML.
*
* @param {puppeteer.Page} page
* @returns {Promise<string>} MHTML
*/
async function readPageMhtml(page) {
const cdp = await page.target().createCDPSession()
const result = await cdp.send('Page.captureSnapshot', { format: 'mhtml' })
return result.data
}
/**
* Read browser page as PNG.
*
* @param {puppeteer.Page} page
* @returns {Promise<Buffer>} PNG
*/
async function readPagePng(page) {
return await page.screenshot({ fullPage: true })
}
/**
* Read browser page as PDF.
*
* @param {puppeteer.Page} page
* @returns {Promise<Buffer>} PDF
*/
async function readPagePdf(page) {
await page.emulateMediaType('screen')
const width = await page.evaluate(
() => document.documentElement.scrollWidth
)
const height = await page.evaluate(
() => document.documentElement.scrollHeight
)
return await page.pdf({
printBackground: true,
// HACK: Add 10% to height to prevent content from bleeding into second page
height: `${Math.ceil(height * 1.10)}px`,
width: `${width}px`,
pageRanges: '1'
})
}
/**
* Compute MD5 hash.
*
* @param {string} x
* @returns {string} MD5 hash
*/
function md5(x) {
return crypto.createHash('md5').update(x).digest('hex')
}
/**
* Compute MD5 hash of a file read as a stream.
*
* Uses base64 encoding by default, as it was found to be much faster for large
* binary files and just as fast as UTF-8 for text. Note that the hash is
* therefore computed over the encoded stream rather than the raw bytes.
*
* @param {string} file
* @param {object} [options] Options passed to fs.createReadStream
* @returns {Promise<string>} MD5 hash
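* @example
* // Illustrative only: the path is a placeholder
* const checksum = await hashFile('downloads/trees.csv')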
*/
async function hashFile(file, options = {encoding: 'base64'}) {
const hash = crypto.createHash('md5')
const stream = fs.createReadStream(file, options)
for await (const chunk of stream) {
hash.update(chunk.toString())
}
return hash.digest('hex')
}
/**
* Build archive path.
*
* @param {object} params
* @param {string} [params.url] URL, represented in the path by its MD5 hash
* @param {string} [params.checksum] File hash, used in place of the URL hash if no url
* @param {Date} [params.date] Date, represented by an ISO 8601 string with colons (:) removed
* @returns {string}
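* @example
* // Illustrative only: returns something like
* // '<DATA_ARCHIVE>/<md5 of url>/2020-01-01T000000.000Z'
* const dir = buildPath({url: 'https://example.com/trees.csv', date: new Date('2020-01-01')})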
*/
function buildPath({url, checksum, date = new Date()} = {}) {
const hash = url ? md5(url) : checksum
date = date.toISOString().replace(/:/g, '')
return path.join(process.env.DATA_ARCHIVE, hash, date)
}
/**
* Add an entry to the archive log.
*
* @param {object} params Entry properties
* @param {Date} [params.date] Entry date (defaults to now)
* @param {string} [params.path] Path to the archived file (must be within process.env.DATA_ARCHIVE)
* @param {string} [params.checksum] File checksum (computed from params.path if missing)
* @returns {Promise<ArchiveEntry>}
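* @example
* // Illustrative only: `file` is assumed to be a path inside process.env.DATA_ARCHIVE
* const entry = await log({url: 'https://example.com/trees.csv', path: file})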
*/
async function log({ date = new Date(), ...props } = {}) {
let entry = { date, ...props }
if (entry.path) {
// Check that the path is in the archive
const parent = path.resolve(process.env.DATA_ARCHIVE)
if (!path.resolve(entry.path).startsWith(parent)) {
throw new Error(`Path ${entry.path} is not in ${process.env.DATA_ARCHIVE}`)
}
}
if (entry.path && !entry.checksum) {
entry.checksum = await hashFile(entry.path)
}
// Make path relative to archive
if (entry.path) {
entry.path = path.relative(process.env.DATA_ARCHIVE, entry.path)
}
if (entry.checksum) {
const entries = await search({ checksum: entry.checksum })
if (entries.length > 0) {
// Reuse path of most recent entry with same checksum
const existingEntry = entries.sort(
(a, b) => new Date(b.date) - new Date(a.date)
)[0]
entry.path = existingEntry.path
}
}
fs.appendFileSync(process.env.DATA_ARCHIVE_LOG, JSON.stringify(entry) + '\n')
// Expand path to full path, if present
if (entry.path) {
entry.path = path.join(process.env.DATA_ARCHIVE, entry.path)
}
return entry
}
/**
* Guess filename from HTTP response headers.
*
* @param {object} params
* @param {object} [params.headers] HTTP response headers
* @param {string} [params.defaultBasename] Basename to use if none is found
* @param {string} [params.url] HTTP request URL
* @returns {string}
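* @example
* // Illustrative only: header values are made up
* guessFilename({headers: {
* 'content-disposition': 'attachment; filename="trees.zip"',
* 'content-type': 'application/zip'
* }}) // => 'trees.zip'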
*/
function guessFilename({headers = {}, defaultBasename = 'response', url = null} = {}) {
const basenames = []
if (headers['content-disposition']) {
const match = headers['content-disposition'].match(
/filename[^;=\n]*=\s*(UTF-\d['"]*)?((['"]).*?\3|[^;\n]*)?/i
)
if (match) {
// Remove begin or end quotes
basenames.push(match[2].replace(/^['"]|['"]$/g, ''))
}
}
if (url) {
const u = new URL(url)
const filename = path.basename(u.pathname)
if (path.extname(filename)) {
basenames.push(filename)
}
}
basenames.push(defaultBasename)
const ADDITIONAL_MIME_TYPES = {
// https://mimetype.io/application/x-zip-compressed
'application/zip-compressed': 'zip',
'application/x-zip-compressed': 'zip',
'multipart/x-zip': 'zip'
}
const extension = (
mime.extension(headers['content-type']) ||
ADDITIONAL_MIME_TYPES[headers['content-type']]
)
const basename = basenames.filter(x => x)[0].replace(/\.$/, '')
if (extension && path.extname(basename) !== `.${extension}`) {
return `${basename}.${extension}`
}
return basename
}
/**
* Write data to file and add to log.
*
* @param {object} params
* @param {string} params.data
* @param {string} params.filename
* @param {string} params.url
* @param {Date} params.date
* @returns {Promise<ArchiveEntry>}
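* @example
* // Illustrative only (assumes DATA_ARCHIVE and DATA_ARCHIVE_LOG are set): url and content are placeholders
* const entry = await logData({data: '<html></html>', filename: 'response.html', url: 'https://example.com'})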
*/
function logData({ data, filename, url, date = new Date(), ...props } = {}) {
const dir = buildPath({ url, date })
const file = path.join(dir, filename)
// Write file
fs.mkdirSync(dir, { recursive: true })
fs.writeFileSync(file, data)
// Log file
return log({ url, path: file, date, ...props })
}
/**
* Search log for matching entries.
*
* @param {object} params - Search criteria as key-value pairs that must match
* @param {object} options
* @param {number} [options.limit] - Maximum number of results to return
* @param {number} [options.maxDays] - Maximum age of result in days
* @returns {ArchiveEntry[]} Entries that match search criteria, sorted by date
* descending.
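* @example
* // Illustrative only: the url is a placeholder
* const [latest] = search({url: 'https://example.com/trees.csv'}, {maxDays: 30})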
*/
function search(params, {limit, maxDays} = {}) {
let maxDate
if (typeof maxDays === 'number' && isFinite(maxDays)) {
maxDate = new Date()
maxDate.setSeconds(maxDate.getSeconds() - maxDays * 24 * 3600)
}
const criterias = Object.entries(params || {})
const entries = helpers.readJSONL(process.env.DATA_ARCHIVE_LOG)
// Filter entries
const selected = []
for (const entry of entries) {
if (limit && selected.length === limit) {
break
}
if (criterias.map(([key, value]) => entry[key] === value).every(Boolean)) {
if (!maxDate || new Date(entry.date) > maxDate) {
selected.push(entry)
}
}
}
// Sort entries by date descending
selected.sort((a, b) => new Date(b.date) - new Date(a.date))
// Convert paths to full paths
selected.forEach(entry => {
entry.path = path.join(process.env.DATA_ARCHIVE, entry.path)
})
return selected
}
module.exports = {
md5,
loadPage,
readPageHtml,
readPageMhtml,
readPagePng,
readPagePdf,
buildPath,
guessFilename,
log,
logData,
search,
hashFile,
downloadFile
}