# Python 3 script to extract all images from a PDF document on the command line
"""Extract the stream objects embedded in a PDF file.

Usage: python <this script> FILE.pdf

The file is scanned as raw bytes for ``stream`` ... ``endstream`` pairs.
A stream whose data begins with the JPEG start-of-image marker is written
to ``jpg<N>.jpg``; every other stream is dumped unmodified to ``data<N>``.
``<N>`` is one counter shared by both kinds of file, so numbers reflect the
order in which streams appear in the PDF.

This is a byte-level scan, not a PDF parser: it does not consult /Length or
/Filter, so streams compressed with anything other than DCTDecode come out
still encoded.
"""
import os
import sys

# The JPEG start marker must appear within this many bytes after the
# ``stream`` keyword for the stream to be treated as a JPEG image.
MINIMUM_SEEK = 20

STREAM_KEYWORD = b'stream'
ENDSTREAM_KEYWORD = b'endstream'

FORMATS = {
    "jpeg": {
        "start": b'\xff\xd8',
        "end": b'\xff\xd9',
    },
}


def extract_streams(pdf, out_dir=os.curdir):
    """Write every stream found in *pdf* to a file inside *out_dir*.

    Args:
        pdf: Complete contents of the PDF file as ``bytes``.
        out_dir: Directory that receives the output files; defaults to the
            current working directory.

    Returns:
        The paths of the files written, in the order they were created.

    Raises:
        ValueError: If a ``stream`` keyword has no matching ``endstream``,
            or a JPEG start marker has no end marker inside its stream.
    """
    jpeg_start = FORMATS["jpeg"]["start"]
    jpeg_end = FORMATS["jpeg"]["end"]

    written = []
    filenumber = 0
    cursor = 0
    while True:
        istream = pdf.find(STREAM_KEYWORD, cursor)
        if istream < 0:
            break
        print(istream)

        stream_end = pdf.find(ENDSTREAM_KEYWORD, istream)
        if stream_end < 0:
            raise ValueError("Didn't find end of stream!")

        # Only look for the start marker right after the keyword, and never
        # beyond this stream's own ``endstream``.
        window_end = min(istream + MINIMUM_SEEK, stream_end)
        istart = pdf.find(jpeg_start, istream, window_end)

        if istart < 0:
            # Not a JPEG: dump the raw bytes from the ``stream`` keyword up
            # to (not including) ``endstream``.
            path = os.path.join(out_dir, "data%d" % filenumber)
            payload = pdf[istream:stream_end]
        else:
            # The last end marker inside the stream closes the image; an
            # earlier one may belong to an embedded thumbnail. Bounding the
            # search keeps it from running into a later object.
            iend = pdf.rfind(jpeg_end, istart, stream_end)
            if iend < 0:
                raise ValueError("Didn't find end of JPG!")
            iend += len(jpeg_end)  # include the end marker itself
            print("JPG %d from %d to %d" % (filenumber, istart, iend))
            path = os.path.join(out_dir, "jpg%d.jpg" % filenumber)
            payload = pdf[istart:iend]

        with open(path, "wb") as outfile:
            outfile.write(payload)
        written.append(path)
        filenumber += 1

        # Resume after the whole ``endstream`` keyword; resuming any earlier
        # would match the "stream" inside "endstream" as a new stream start.
        cursor = stream_end + len(ENDSTREAM_KEYWORD)

    return written


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        program = args[0] if args else "extract"
        print("usage: %s FILE.pdf" % program, file=sys.stderr)
        return 2

    with open(args[1], "rb") as source:
        pdf = source.read()
    extract_streams(pdf)
    return 0


if __name__ == "__main__":
    sys.exit(main())