# Shell utility helpers (original listing: 105 lines, 2.9 KiB, plaintext).
# _utils_ocrmypdf_pages <in.pdf> <out.pdf> <workdir>
#
# Private helper for _utils_ocrmypdf: splits <in.pdf> into single pages inside
# <workdir>, OCRs each page via a 300 dpi PNG rendering, and merges the
# per-page results into <out.pdf>. The caller owns <workdir> and removes it.
# Returns 0 on success, 1 as soon as any required step fails.
_utils_ocrmypdf_pages() {
    local src="$1" dst="$2" workdir="$3"
    local i=0 page img rotation

    pdfseparate "$src" "$workdir/page-%04d.pdf" || return 1

    # The inputs are saved above, so the positional parameters are reused as a
    # quoting-safe, ordered list of per-page outputs (works without bash arrays
    # and replaces the old unquoted `ls -v` expansion).
    set --

    for page in "$workdir"/page-*.pdf; do
        # An unmatched glob is left as a literal pattern; skip it.
        [ -e "$page" ] || continue

        img="$workdir/img-$i.png"

        # Reset the page rotation to 0 before rasterizing (best effort, as in
        # the original: the exit status is deliberately not checked).
        qpdf --replace-input --rotate=0:1-z "$page"
        pdftoppm -singlefile -r 300 -png -cropbox "$page" "$workdir/img-$i" || return 1

        # Ask tesseract's orientation detection (--psm 0) how far the image has
        # to be rotated. The value is empty when detection yields nothing.
        rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')

        case "$rotation" in
            90|180|270) convert "$img" -rotate "$rotation" "$img" || return 1 ;;
        esac

        ocrmypdf \
            --skip-text \
            --clean \
            --optimize 1 \
            --jobs 1 \
            "$img" "$workdir/ocr-$i-tmp.pdf" || return 1

        # Undo the temporary quarter turn so the page keeps its original
        # orientation: the page rotation is set to (360 - rotation). The
        # original used 270 for both cases, which left "270" pages turned by
        # 180 degrees. Pages detected as 180 stay corrected, as before.
        # Exactly one branch produces ocr-$i.pdf; previously an unconditional
        # mv ran after the source had already been removed.
        case "$rotation" in
            90)  qpdf "$workdir/ocr-$i-tmp.pdf" "$workdir/ocr-$i.pdf" --rotate=270:1-z || return 1 ;;
            270) qpdf "$workdir/ocr-$i-tmp.pdf" "$workdir/ocr-$i.pdf" --rotate=90:1-z || return 1 ;;
            *)   mv "$workdir/ocr-$i-tmp.pdf" "$workdir/ocr-$i.pdf" || return 1 ;;
        esac

        set -- "$@" "$workdir/ocr-$i.pdf"
        i=$((i+1))
    done

    # Nothing to merge means the split produced no pages.
    [ "$#" -gt 0 ] || return 1

    pdfunite "$@" "$dst" || return 1
}

# _utils_ocrmypdf <in.pdf> <out.pdf>
#
# Writes an OCRed version of <in.pdf> to <out.pdf>. If pdffonts reports any
# named font within the first five pages, the input is treated as already
# OCRed and is copied unchanged. Otherwise every page is OCRed separately
# (see _utils_ocrmypdf_pages) in a temporary directory that is always removed.
#
# Returns 0 on success and 1 on usage errors or when any step fails. The
# function uses `return`, never `exit`, so it is safe to call from a sourced
# file or an interactive shell.
_utils_ocrmypdf() {
    if [ "$#" -lt 2 ]; then
        echo "Usage: <in.pdf> <out.pdf>" >&2
        return 1
    fi

    local src="$1" dst="$2" workdir rc

    # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
    # MYFONTS is left global, as in the original, in case other code reads it.
    MYFONTS=$(pdffonts -l 5 "$src" | tail -n +3 | cut -d' ' -f1 | sort -u)

    if [ -n "$MYFONTS" ] && [ "$MYFONTS" != '[none]' ]; then
        echo "$src is already OCRed. Saving as is."
        cp "$src" "$dst" || return 1
        return 0
    fi

    echo "NOT OCRed yet. Working..."

    workdir=$(mktemp -d) || return 1

    _utils_ocrmypdf_pages "$src" "$dst" "$workdir"
    rc=$?

    # Clean up on success and on failure alike.
    rm -rf "$workdir"
    return "$rc"
}

# _utils_fix_dashes <input string>
#
# Prints a cleaned-up form of the string on stdout (no trailing newline):
#   - every layer of percent-encoding is removed,
#   - the text is NFKC-normalized,
#   - Unicode dash variants become "-", curly quotes become straight quotes,
#   - "&" becomes "and",
#   - no-break spaces become spaces and zero-width characters are dropped,
#   - whitespace runs collapse to one space, the ends are trimmed, and
#     whitespace in front of a trailing ".ext" is removed.
#
# Returns 1 (without exiting the calling shell) when no argument is given.
# Requires perl with URI::Escape and Unicode::Normalize.
_utils_fix_dashes() {
    if [ "$#" -eq 0 ]; then
        echo "Usage: <input string>" >&2
        return 1
    fi

    # NOTE: the Perl program sits inside single quotes, so it must not contain
    # a literal apostrophe (hence \x27 below).
    perl -CSD -MURI::Escape -MUnicode::Normalize -e '
        use strict;
        use warnings;
        binmode STDOUT, ":utf8";

        # Strip every layer of percent-encoding; input may be encoded twice.
        sub unescape_fully {
            my ($text) = @_;
            my $before;
            do { $before = $text; $text = uri_unescape($text); } while ($text ne $before);
            return $text;
        }

        # The argument is read as raw bytes (no -CA). uri_unescape yields one
        # octet per %XX, so decoding must happen AFTER unescaping; otherwise
        # "%E2%80%93" turns into three junk characters instead of an en dash.
        my $raw = shift // "";
        my $s   = unescape_fully($raw);
        if (!utf8::decode($s)) {
            # The octets are not valid UTF-8 (e.g. a Latin-1 "%E9"): fall back
            # to the previous behavior of decoding the literal text first and
            # keeping each unescaped octet as its own code point.
            $s = $raw;
            utf8::decode($s);
            $s = unescape_fully($s);
        }

        $s = NFKC($s);
        $s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
        $s =~ s/&/and/g;
        $s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
        $s =~ tr/\x{201C}\x{201D}/"/;
        $s =~ tr/\x{00A0}/ /;
        $s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
        $s =~ s/\s+/ /g;
        $s =~ s/^\s+|\s+$//g;
        # Drop whitespace that precedes a trailing file extension.
        $s =~ s/\s+(\.[^. ]+)$/$1/;
        print $s;
    ' "$1"
}

# _utils_download_helper <url> <outfile>
#
# Downloads <url> to <outfile> with curl, following redirects and retrying up
# to three times. When <outfile> already exists its timestamp is sent as a
# time condition (-z), so an unchanged resource answers 304 and is skipped.
#
# The body is written to "<outfile>.part.<pid>" first and only moved into
# place after a clean HTTP 200. A failed or truncated transfer therefore never
# overwrites or deletes a previously downloaded copy.
#
# Prints "Downloaded." or "Already exists! Skipping." on stdout.
# Returns 0 for HTTP 200/304, and 1 on usage errors or any other outcome
# (diagnostic on stderr). Uses `return`, never `exit`, so it is safe to call
# from a sourced file or an interactive shell.
_utils_download_helper() {
    if [ "$#" -lt 2 ]; then
        echo "Usage: <url> <outfile>" >&2
        return 1
    fi

    # Left global, as in the original, in case other code reads it.
    WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

    local url="$1"
    local out="$2"
    local part="$out.part.$$"
    local code
    local rc

    # The inputs are saved above, so the positional parameters are reused to
    # build the curl argument list without word-splitting problems.
    # NOTE(review): -k disables TLS certificate verification; kept for
    # backward compatibility, but consider removing it.
    set -- -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2

    # Only add the time condition when there is a previous copy to compare to.
    if [ -f "$out" ]; then
        set -- "$@" -z "$out"
    fi

    # `local code` is declared separately so that $? here is curl's status.
    code=$(curl "$@" -o "$part" "$url")
    rc=$?

    # Match on "<curl exit status>/<http code>" so that a transfer that broke
    # off after a 200 status line is not reported as a success.
    case "$rc/$code" in
        0/200)
            mv -f "$part" "$out" || { rm -f "$part"; return 1; }
            echo "Downloaded."
            ;;
        0/304)
            rm -f "$part"
            echo "Already exists! Skipping."
            ;;
        *)
            echo "FAILED! $code: $out | $url" >&2
            rm -f "$part"
            return 1
            ;;
    esac
}