#!/bin/bash -u
# $0 [filename.pdf...]
# Convert Microsoft PDF file to Unicode "plain" text file.
# Converts *.pdf if no arguments given.
# Each input "name.pdf" produces "name.txt" next to it; an input whose name
# does not end in ".pdf" produces "name.txt" appended to the full name.
# Exits non-zero as soon as any conversion step fails.
# Uses bash $'...' quoting for Unicode characters.
# -Ian! D. Allen - idallen@idallen.ca - www.idallen.com

# -u is repeated here because options on the #! line are lost when the
# script is started as "bash script".  pipefail makes a failure of
# pdftotext or sed (not just the final cat) reach the "|| exit" below,
# so a failed conversion never replaces an existing .txt with empty output.
set -u -o pipefail

PATH=/bin:/usr/bin ; export PATH
umask 022

# if no arguments, use *.pdf for input file(s), if any
if [ $# = 0 ] ; then
    shopt -s nullglob
    set -- *.pdf
fi

# Scratch file in the current directory; removed on every exit path so a
# failed conversion does not leave it behind.
tmp=.tmp$$
rm -f "$tmp"
trap 'rm -f "$tmp"' EXIT

for f do
    # A name starting with "-" would be parsed as an option by pdftotext
    # and mv; "./" names the same file without that ambiguity.
    case "$f" in
    -*) f=./$f ;;
    esac

    # Fix some PDF characters to be proper Unicode (needs bash for $'...')
    # Some fiddling was done to pick characters that worked in both Firefox
    # and in the urxvt emulator and tmux terminal screen-256color-bce
    # big black circle: \u25cf
    # bullet: \u2022
    # small bullet: \u2027
    # triangle: \u2023
    # The sed y/// command maps each listed private-use (U+F0xx) code point
    # to the character in the same position of the second list; cat -s then
    # squeezes runs of blank lines.
    pdftotext -nopgbrk -layout "$f" - \
    | sed -e $'y/\uf0a8\uf020\uf0b7\uf0d8\uf07d\uf06c\uf096\uf06e\uf0e0/\u2022\u2022\u2022\u2022\u2022\u2022\u2027\u2027\u2192/' \
    | cat -s >"$tmp" || exit $?

    # Only install the result once the whole pipeline has succeeded.
    out=${f%.pdf}.txt
    mv "$tmp" "$out" || exit $?
done