# Internal worker for _utils_ocrmypdf.
#
# Splits the source PDF into single pages inside an existing scratch
# directory, OCRs each page from a 300 dpi PNG rendering, and unites the
# per-page results into the destination file.
#
# Arguments:
#   $1  source PDF
#   $2  destination PDF
#   $3  existing scratch directory (the caller owns it and removes it)
# Returns 0 on success, 1 as soon as a required step fails.
__utils_ocrmypdf_pages() {
    local src="$1"
    local dest="$2"
    local tmp="$3"
    local i=0
    local page img rotation

    pdfseparate "$src" "$tmp/page-%04d.pdf" || return 1

    # The positional parameters are reused as the ordered list of per-page
    # outputs, so the final pdfunite call needs no `ls` parsing and is safe
    # for scratch paths containing whitespace.
    set --

    for page in "$tmp"/page-*.pdf; do
        img="$tmp/img-$i.png"

        # Reset any page rotation stored in the PDF before rendering.
        qpdf --replace-input --rotate=0:1-z "$page"
        pdftoppm -singlefile -r 300 -png -cropbox "$page" "$tmp/img-$i" || return 1

        # Checks rotations. Annoying way to do it but whatever.
        # Takes the value of the "Rotate: N" line from tesseract's
        # orientation detection (--psm 0).
        rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
        case "$rotation" in
            90|180|270) convert "$img" -rotate "$rotation" "$img" ;;
        esac

        ocrmypdf \
            --skip-text \
            --clean \
            --optimize 1 \
            --jobs 1 \
            "$img" "$tmp/ocr-$i-tmp.pdf" || return 1

        case "$rotation" in
            90|270)
                # NOTE(review): both cases rotate the OCRed page by the same
                # absolute 270 degrees, and the 180 case is not rotated back
                # at all. Kept exactly as it was -- confirm the intended
                # final orientation.
                if qpdf "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" --rotate=270:1-z; then
                    rm -f "$tmp/ocr-$i-tmp.pdf"
                else
                    # qpdf failed: fall back to the page as OCRed.
                    mv "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" || return 1
                fi
                ;;
            *)
                mv "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" || return 1
                ;;
        esac

        set -- "$@" "$tmp/ocr-$i.pdf"
        i=$((i+1))
    done

    pdfunite "$@" "$dest" || return 1
}

# OCR a PDF page by page.
#
# Usage: _utils_ocrmypdf <input.pdf> <output.pdf>
#
# If pdffonts reports any font in the first five pages, the input is treated
# as already OCRed and is copied to the output unchanged. Otherwise every
# page is rendered, OCRed with ocrmypdf and merged into the output.
#
# Sets the global MYFONTS. Returns 0 on success and 1 on a usage error or
# when any required step fails. Uses `return`, never `exit`, so it is safe
# to call from a sourced file or inside a loop.
_utils_ocrmypdf() {
    if [ "$#" -lt 2 ]; then
        echo "Usage: _utils_ocrmypdf <input.pdf> <output.pdf>" >&2
        return 1
    fi

    # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
    MYFONTS=$(pdffonts -l 5 "$1" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
    if [ "$MYFONTS" = '' ] || [ "$MYFONTS" = '[none]' ]; then
        echo "NOT OCRed yet. Working..."
    else
        echo "$1 is already OCRed. Saving as is."
        cp "$1" "$2" || return 1
        return 0
    fi

    local tmp rc
    tmp=$(mktemp -d) || return 1

    __utils_ocrmypdf_pages "$1" "$2" "$tmp"
    rc=$?

    # Always remove the scratch directory, whether the run succeeded or not.
    rm -rf "$tmp"
    return "$rc"
}

# Normalise a string (typically a file name taken from a URL) and print it
# to stdout without a trailing newline.
#
# Usage: _utils_fix_dashes <string>
#
# Steps, in order: repeatedly undo percent-encoding, apply NFKC, map Unicode
# dash characters to "-", replace "&" with "and", straighten curly quotes,
# turn no-break spaces into spaces, drop zero-width characters, collapse and
# trim whitespace, and remove whitespace before a trailing ".ext".
#
# Requires the Perl modules URI::Escape and Unicode::Normalize.
# Returns 1 on a usage error, otherwise the exit status of perl.
_utils_fix_dashes() {
    if [ "$#" -eq 0 ]; then
        echo "Usage: _utils_fix_dashes <string>" >&2
        return 1
    fi

    # The Perl program is single-quoted for the shell, so it must not
    # contain an apostrophe anywhere (hence \x27 below).
    perl -CSDA -MURI::Escape -MUnicode::Normalize -e '
        use strict;
        use warnings;

        binmode STDOUT, ":utf8";

        my $s = shift // "";

        # Undo percent-encoding until the string stops changing, so that
        # doubly encoded input such as %2520 is fully resolved.
        # uri_unescape yields octets, so unescape the UTF-8 encoded form and
        # decode the result; this turns %E2%80%93 into U+2013 rather than
        # three Latin-1 characters. When the octets are not valid UTF-8,
        # fall back to unescaping the character string directly.
        my $prev;
        do {
            $prev = $s;
            my $octets = $s;
            utf8::encode($octets);
            my $decoded = uri_unescape($octets);
            $s = utf8::decode($decoded) ? $decoded : uri_unescape($s);
        } while ($s ne $prev);

        $s = NFKC($s);

        # Unicode hyphens, dashes and minus signs become ASCII hyphen-minus.
        $s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
        $s =~ s/&/and/g;

        # Curly single and double quotes become their ASCII forms.
        $s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
        $s =~ tr/\x{201C}\x{201D}/"/;

        # No-break space becomes a space; zero-width characters are dropped.
        $s =~ tr/\x{00A0}/ /;
        $s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;

        # Collapse whitespace runs, trim both ends, then close the gap in
        # front of a trailing extension ("name .pdf" becomes "name.pdf").
        $s =~ s/\s+/ /g;
        $s =~ s/\A\s+|\s+\z//g;
        $s =~ s/\s+(\.[^.\s]+)\z/$1/;

        print $s;
    ' "$1"
}

# Download a URL to a file, skipping the transfer when the server reports
# that the existing file is still current.
#
# Usage: _utils_download_helper <url> <output-file>
#
# Sets the global WGET_UA (the User-Agent sent with the request).
# The response is written to a sibling temporary file and only replaces the
# output file on HTTP 200, so a failed request never destroys a file that
# was downloaded earlier.
#
# Returns 0 on HTTP 200 or 304, 1 on a usage error or any other outcome.
_utils_download_helper() {
    if [ "$#" -lt 2 ]; then
        echo "Usage: _utils_download_helper <url> <output-file>" >&2
        return 1
    fi

    WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

    local url="$1"
    local out="$2"
    local part="$out.part.$$"
    local code

    # Only send a time condition when there is an existing file to compare
    # against; the positional parameters hold the optional curl arguments.
    if [ -e "$out" ]; then
        set -- -z "$out"
    else
        set --
    fi

    # NOTE(review): -k disables TLS certificate verification. Kept because
    # callers may rely on it for hosts with broken certificates -- confirm
    # that this is still wanted.
    code=$(curl -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2 "$@" -o "$part" "$url")

    case "$code" in
        200)
            # Make sure there is a file to move even for an empty body.
            [ -e "$part" ] || : > "$part"
            if ! mv -f "$part" "$out"; then
                rm -f "$part"
                return 1
            fi
            echo "Downloaded."
            ;;
        304)
            rm -f "$part"
            echo "Already exists! Skipping."
            ;;
        *)
            echo "FAILED! $code: $out | $url" >&2
            rm -f "$part"
            return 1
            ;;
    esac
}