LondonScrapers/functions/.functions.utils
2026-06-19 23:30:51 -04:00

105 lines
2.9 KiB
Plaintext

# OCR a PDF page by page and write the result to <out.pdf>.
#
# Arguments: $1 - input PDF
#            $2 - output PDF
# Outputs:   progress messages on stdout; usage and errors on stderr.
# Returns:   0 on success (including the "already OCRed, copied as is" case),
#            1 on a usage error or when a required pipeline step fails.
#
# Uses `return` (never `exit`) so that a script which sources this file is not
# terminated by calling the function. All working variables are local, and the
# scratch directory is removed on every path.
_utils_ocrmypdf() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <in.pdf> <out.pdf>" >&2
    return 1
  fi
  local in="$1"
  local out="$2"
  local fonts tmp status

  # A PDF whose first 5 pages list no fonts is treated as not OCRed yet.
  # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
  fonts=$(pdffonts -l 5 "$in" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
  if [ -n "$fonts" ] && [ "$fonts" != '[none]' ]; then
    echo "$in is already OCRed. Saving as is."
    cp -- "$in" "$out" || return 1
    return 0
  fi
  echo "NOT OCRed yet. Working..."

  tmp=$(mktemp -d) || return 1
  status=0
  # `|| status=$?` keeps the cleanup below reachable even when the caller
  # runs under `set -e`.
  _utils_ocrmypdf_pages "$in" "$out" "$tmp" || status=$?
  rm -rf -- "${tmp:?}"
  return "$status"
}

# Helper for _utils_ocrmypdf: splits $1 into single pages inside the scratch
# directory $3, OCRs each page and unites the results into $2. Returns 1 on the
# first failing required step. The caller owns and removes the scratch directory.
_utils_ocrmypdf_pages() {
  local in="$1"
  local out="$2"
  local tmp="$3"
  local page idx img raw ocr rotation
  local i=0

  pdfseparate "$in" "$tmp/page-%04d.pdf" || return 1
  for page in "$tmp"/page-*.pdf; do
    [ -e "$page" ] || continue # the glob matched nothing
    # Zero-padded index so the final glob expands in page order without
    # having to parse `ls -v` output.
    idx=$(printf '%06d' "$i")
    img="$tmp/img-$idx.png"
    raw="$tmp/raw-$idx.pdf"
    ocr="$tmp/ocr-$idx.pdf"

    # Set the rotation of every page (1-z) to 0 before rasterizing.
    # Best effort: the exit status was not checked originally either.
    qpdf --replace-input --rotate=0:1-z "$page"
    pdftoppm -singlefile -r 300 -png -cropbox "$page" "${img%.png}" || return 1

    # Checks rotations. Annoying way to do it but whatever.
    rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
    case "$rotation" in
      90 | 180 | 270) convert "$img" -rotate "$rotation" "$img" ;;
    esac

    ocrmypdf \
      --skip-text \
      --clean \
      --optimize 1 \
      --jobs 1 \
      "$img" "$raw" || return 1

    case "$rotation" in
      90 | 270)
        # NOTE(review): both the 90 and the 270 case set the page rotation to
        # 270, exactly as before - confirm that is intended for the 270 case.
        # If qpdf fails, fall back to the unrotated OCR output.
        if qpdf "$raw" "$ocr" --rotate=270:1-z; then
          rm -f -- "$raw"
        else
          mv -f -- "$raw" "$ocr" || return 1
        fi
        ;;
      *)
        mv -f -- "$raw" "$ocr" || return 1
        ;;
    esac
    i=$((i + 1))
  done

  if [ "$i" -eq 0 ]; then
    echo "No pages could be extracted from $in" >&2
    return 1
  fi
  pdfunite "$tmp"/ocr-*.pdf "$out" || return 1
}
# Normalize a string and print it to stdout without a trailing newline.
#
# Steps, in order: percent-decode repeatedly until stable, apply NFKC, map
# Unicode dashes to "-", replace "&" with "and", map curly quotes to ASCII
# quotes, turn no-break spaces into spaces, drop zero-width characters,
# collapse and trim whitespace, and remove whitespace directly before a
# trailing ".ext" suffix.
#
# Arguments: $1 - input string (UTF-8, may contain %XX escapes)
# Returns:   1 on a usage error (via `return`, so a sourcing script is not
#            terminated); otherwise the exit status of perl.
# Requires:  perl with URI::Escape and Unicode::Normalize.
_utils_fix_dashes() {
  if [ "$#" -eq 0 ]; then
    echo "Usage: <input string>" >&2
    return 1
  fi
  # The argument is deliberately NOT decoded by perl itself (no A in -C):
  # uri_unescape produces one octet per %XX, so percent-encoded UTF-8 such as
  # %E2%80%93 only becomes a real character if the UTF-8 decoding happens
  # after the unescaping.
  perl -CSD -MURI::Escape -MUnicode::Normalize -e '
    binmode STDOUT, ":utf8";

    # Undo percent-encoding until the string stops changing (handles %2520).
    sub unescape_all {
      my ($t) = @_;
      my $prev;
      do { $prev = $t; $t = uri_unescape($t); } while ($t ne $prev);
      return $t;
    }

    my $raw = shift // "";
    my $s = unescape_all($raw);
    unless (utf8::decode($s)) {
      # The escapes were not UTF-8 (e.g. a lone Latin-1 %E9): decode the
      # argument first and map each %XX to a single character instead.
      $s = $raw;
      utf8::decode($s);
      $s = unescape_all($s);
    }

    $s = NFKC($s);
    $s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
    $s =~ s/&/and/g;
    $s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
    $s =~ tr/\x{201C}\x{201D}/"/;
    $s =~ tr/\x{00A0}/ /;
    $s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
    $s =~ s/\s+/ /g;
    $s =~ s/^\s+|\s+$//g;
    # "name .pdf" -> "name.pdf"
    $s =~ s/\s+(\.[^. ]+)$/$1/;
    print $s;
  ' "$1"
}
# Download <url> to <outfile> with curl, skipping the transfer when the
# server reports the existing file is not older than the remote (-z).
#
# Arguments: $1 - URL
#            $2 - output file
# Globals:   WGET_UA (written) - the browser-like User-Agent sent to the server.
# Outputs:   "Downloaded." or "Already exists! Skipping." on stdout;
#            usage and failure messages on stderr.
# Returns:   0 on HTTP 200 or 304; 1 on a usage error, on any other HTTP
#            status, or when curl itself exits non-zero. On failure the output
#            file is removed so no error page or partial file is left behind.
#
# Uses `return` (never `exit`) so a script that sources this file is not
# terminated by calling the function.
_utils_download_helper() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <url> <outfile>" >&2
    return 1
  fi
  # Left global on purpose: it was global before and other code may read it
  # - TODO confirm, then make it local.
  WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"
  local url="$1"
  local out="$2"
  local code
  local curl_rc=0
  # -k disables TLS certificate verification (kept from the original).
  # `|| curl_rc=$?` records curl failures (e.g. a truncated transfer that
  # still reports HTTP 200) instead of ignoring them.
  code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 -z "$out" -o "$out" "$url") || curl_rc=$?
  if [ "$curl_rc" -ne 0 ]; then
    echo "FAILED! $code: $out | $url (curl exit $curl_rc)" >&2
    rm -f -- "$out"
    return 1
  fi
  case "$code" in
    200)
      echo "Downloaded."
      ;;
    304)
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      rm -f -- "$out"
      return 1
      ;;
  esac
}