# LondonScrapers/functions/.functions.utils

#######################################
# Per-page OCR worker for _utils_ocrmypdf.
# Splits the input into single-page PDFs, rasterises each page at 300 dpi,
# rotates the image according to tesseract's orientation detection, runs
# ocrmypdf on the image and finally joins the per-page results with pdfunite.
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
#   $3 - existing scratch directory (owned and removed by the caller)
# Returns:
#   0 when pdfunite wrote $2, 1 as soon as a checked step fails.
#######################################
_utils_ocrmypdf_pages() {
  local src="$1"
  local dst="$2"
  local tmp="$3"
  local i=0
  local page img rotation

  pdfseparate "$src" "$tmp/page-%04d.pdf" || return 1

  # The positional parameters are reused as a portable list of per-page
  # results (in page order), so nothing has to be re-discovered by parsing
  # `ls` output and paths containing whitespace stay intact.
  set --
  for page in "$tmp"/page-*.pdf; do
    img="$tmp/img-$i.png"
    # Clear any rotation flag on the page before rasterising it. Best effort:
    # the result of this call is deliberately not checked.
    qpdf --replace-input --rotate=0:1-z "$page"
    pdftoppm -singlefile -r 300 -png -cropbox "$page" "$tmp/img-$i" || return 1
    # Checks rotations. Annoying way to do it but whatever.
    # tesseract --psm 0 only reports orientation; the "Rotate: N" line is the
    # value used below.
    rotation=$(tesseract "$img" stdout --psm 0 2>/dev/null | awk -F': ' '/Rotate/ {print $2}')
    case "$rotation" in
      90|180|270) convert "$img" -rotate "$rotation" "$img" ;;
    esac
    ocrmypdf \
      --skip-text \
      --clean \
      --optimize 1 \
      --jobs 1 \
      "$img" "$tmp/ocr-$i-tmp.pdf" || return 1

    case "$rotation" in
      90|270)
        # Pages that were turned sideways for OCR get a rotation applied to
        # the resulting PDF; 180-degree corrections are kept as they are.
        # NOTE(review): both 90 and 270 are rotated by 270 here. For the 270
        # case that does not restore the source orientation (90 would) -
        # looks like a copy-paste slip, confirm the intent before changing.
        if qpdf "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" --rotate=270:1-z; then
          rm -f "$tmp/ocr-$i-tmp.pdf"
        else
          # qpdf did not report success: fall back to the unrotated OCR result.
          mv "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" || return 1
        fi
        ;;
      *)
        mv "$tmp/ocr-$i-tmp.pdf" "$tmp/ocr-$i.pdf" || return 1
        ;;
    esac

    set -- "$@" "$tmp/ocr-$i.pdf"
    i=$((i+1))
  done

  pdfunite "$@" "$dst" || return 1
}

#######################################
# Produce an OCRed copy of a PDF.
# If pdffonts reports any font within the first five pages the input is
# treated as already OCRed and simply copied to the output. Otherwise every
# page is OCRed individually (see _utils_ocrmypdf_pages) inside a scratch
# directory that is removed again before returning.
# Arguments:
#   $1 - input PDF
#   $2 - output PDF
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 on success, 1 on usage error or when a processing step fails.
#   Uses `return`, never `exit`, so a sourcing shell is not terminated.
#######################################
_utils_ocrmypdf() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <in.pdf> <out.pdf>"
    return 1
  fi

  local fonts tmp
  local status=0

  # https://stackoverflow.com/questions/7997399/bash-script-to-check-pdfs-are-ocrd
  # Drop the two header lines of pdffonts and keep the distinct font names.
  fonts=$(pdffonts -l 5 "$1" | tail -n +3 | cut -d' ' -f1 | sort | uniq)
  if [ "$fonts" = '' ] || [ "$fonts" = '[none]' ]; then
    echo "NOT OCRed yet. Working..."
  else
    echo "$1 is already OCRed. Saving as is."
    cp "$1" "$2" || return 1
    return 0
  fi

  tmp=$(mktemp -d) || return 1

  # Run the pipeline in a helper so the scratch directory is removed on every
  # exit path, successful or not.
  _utils_ocrmypdf_pages "$1" "$2" "$tmp" || status=$?
  rm -rf -- "$tmp"
  return "$status"
}

#######################################
# Normalise a string to plain ASCII punctuation and tidy whitespace.
# The embedded Perl program, in order:
#   - applies uri_unescape repeatedly until the string stops changing,
#   - applies Unicode NFKC normalisation,
#   - maps the listed Unicode hyphen/dash/minus code points to "-",
#   - replaces "&" with "and",
#   - maps curly single quotes to "'" and curly double quotes to '"',
#   - turns U+00A0 into a space and drops U+200B/U+200C/U+200D/U+FEFF,
#   - collapses whitespace runs, trims both ends, and removes whitespace
#     directly in front of a trailing ".suffix".
# NOTE(review): uri_unescape turns every %XX into a single character, so a
# percent-encoded multi-byte UTF-8 dash (e.g. %E2%80%93) is probably not
# recognised by the later mapping - confirm and decode as UTF-8 if needed.
# Arguments:
#   $1 - input string
# Outputs:
#   The normalised string on stdout, without a trailing newline.
# Returns:
#   1 when called without arguments (uses `return`, so a sourcing shell is
#   not terminated); otherwise the exit status of perl.
#######################################
_utils_fix_dashes() {
    if [ "$#" -eq 0 ]; then
      echo "Usage: <input string>"
      return 1
    fi

    perl -CSDA -MURI::Escape -MUnicode::Normalize -e '
    binmode STDOUT, ":utf8";
    my $s = shift // "";
    my $prev;
    do { $prev = $s; $s = uri_unescape($s); } while ($s ne $prev);
    $s = NFKC($s);
    $s =~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}/-/;
    $s =~ s/&/and/g;
    $s =~ tr/\x{2018}\x{2019}\x{201B}/\x27/;
    $s =~ tr/\x{201C}\x{201D}/"/;
    $s =~ tr/\x{00A0}/ /;
    $s =~ s/[\x{200B}\x{200C}\x{200D}\x{FEFF}]//g;
    $s =~ s/\s+/ /g;
    $s =~ s/^\s+|\s+$//g;
    $s =~ s/\s+(\.[^. ]+)$/$1/;
    print $s;
  ' "$1"
}

#######################################
# Download a URL to a file with curl, skipping unchanged files.
# Follows redirects, retries up to 3 times with a 2 second delay and sends a
# browser-like User-Agent. When the output file already exists it is passed
# to curl as a time condition (-z) so the server can answer 304.
# NOTE(review): -k disables TLS certificate verification - confirm this is
# intentional for the scraped hosts.
# Globals:
#   WGET_UA (written) - the User-Agent string sent with the request
# Arguments:
#   $1 - URL to fetch
#   $2 - output file
# Outputs:
#   Status message on stdout; failure details on stderr.
# Returns:
#   0 on HTTP 200 or 304; 1 on usage error, any other HTTP status, or when
#   curl itself exits non-zero. On failure the output file is removed.
#   Uses `return`, never `exit`, so a sourcing shell is not terminated.
#######################################
_utils_download_helper() {
  if [ "$#" -lt 2 ]; then
    echo "Usage: <url> <outfile>"
    return 1
  fi

  WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

  local url="$1"
  local out="$2"
  local code
  local status=0

  # Build the curl options in the positional parameters (portable list).
  set -- -L -k -A "$WGET_UA" -sS -w "%{http_code}" --retry 3 --retry-delay 2
  # Only send a time condition when there is a file to compare against;
  # otherwise curl would try to interpret the name as a date expression.
  if [ -e "$out" ]; then
    set -- "$@" -z "$out"
  fi

  code=$(curl "$@" -o "$out" "$url") || status=$?
  if [ "$status" -ne 0 ]; then
    # A transfer that curl reports as failed is never a success, even if the
    # server already answered 200 (e.g. truncated body).
    code="${code:-000} (curl exit $status)"
  fi

  case "$code" in
    200)
      echo "Downloaded."
      ;;
    304)
      echo "Already exists! Skipping."
      ;;
    *)
      echo "FAILED! $code: $out | $url" >&2
      rm -f "$out"
      return 1
      ;;
  esac
}