Log data archive paths as relative to archive path

This commit is contained in:
ezwelty 2024-02-21 18:09:50 +01:00 committed by Ethan Welty
parent 97c78f287f
commit 0b5dc54e90
3 changed files with 20 additions and 7 deletions

View File

@ -1,4 +1,4 @@
GOOGLE_MAPS_API_KEY=
DATA_ARCHIVE=archive
DATA_ARCHIVE_LOG=archive.jsonl
GEOCODE_ARCHIVE=geocode
DATA_ARCHIVE=archive/data
DATA_ARCHIVE_LOG=archive/data.jsonl
GEOCODE_ARCHIVE=archive/geocode

2
.gitignore vendored
View File

@ -15,6 +15,4 @@
/out/
/taxa*/
archive
archive.jsonl
geocode
.env

View File

@ -156,9 +156,20 @@ function buildPath({url, checksum, date = new Date()} = {}) {
*/
async function log({ date = new Date(), ...props } = {}) {
let entry = { date, ...props }
if (entry.path) {
// Check that the path is in the archive
const parent = path.resolve(process.env.DATA_ARCHIVE)
if (!path.resolve(entry.path).startsWith(parent)) {
throw new Error(`Path ${entry.path} is not in ${process.env.DATA_ARCHIVE}`)
}
}
if (entry.path && !entry.checksum) {
entry.checksum = await hashFile(entry.path)
}
// Make path relative to archive
if (entry.path) {
entry.path = path.relative(process.env.DATA_ARCHIVE, entry.path)
}
if (entry.checksum) {
const entries = await search({ checksum: entry.checksum })
if (entries.length > 0) {
@ -231,7 +242,7 @@ function logData({ data, filename, url, date = new Date(), ...props } = {}) {
const dir = buildPath(url, date)
const file = path.join(dir, filename)
// Write file
fs.mkdirSync(file, { recursive: true })
fs.mkdirSync(dir, { recursive: true })
fs.writeFileSync(file, data)
// Log file
return log({ url, file, date, ...props })
@ -257,7 +268,7 @@ async function search(params, {limit, maxDays} = {}) {
const entries = []
for await (const log of helpers.iterateJSONL(process.env.DATA_ARCHIVE_LOG)) {
if (limit && entries.length === limit) {
return entries
break
}
if (criterias.map(([key, value]) => log[key] === value).every(Boolean)) {
if (!maxDate || new Date(log.date) > maxDate) {
@ -267,6 +278,10 @@ async function search(params, {limit, maxDays} = {}) {
}
// Sort entries by date descending
entries.sort((a, b) => new Date(b.date) - new Date(a.date))
// Convert paths to full paths
entries.forEach(entry => {
entry.path = path.join(process.env.DATA_ARCHIVE, entry.path)
})
return entries
}