Files
rdkit/Python/sping/PDF/pdfdoc.py
Greg Landrum 75a79b6327 initial import
2006-05-06 22:20:08 +00:00

562 lines
18 KiB
Python
Executable File

#pdfdoc.py
"""
PDFgen is a library to generate PDF files containing text and graphics. It is the
foundation for a complete reporting solution in Python.
The module pdfdoc.py handles the 'outer structure' of PDF documents, ensuring that
all objects are properly cross-referenced and indexed to the nearest byte. The
'inner structure' - the page descriptions - are presumed to be generated before
each page is saved.
pdfgen.py calls this and provides a 'canvas' object to handle page marking operators.
piddlePDF calls pdfgen and offers a high-level interface.
(C) Copyright Andy Robinson 1998-1999
"""
import os
import sys
import string
import time
import tempfile
import cStringIO
from types import *
from math import sin, cos, pi, ceil
try:
import zlib
except:
print "zlib not available, page compression not available"
from pdfgeom import bezierArc
import pdfutils
from pdfutils import LINEEND # this constant needed in both
import pdfmetrics
##############################################################
#
# Constants and declarations
#
##############################################################
StandardEnglishFonts = [
'Courier', 'Courier-Bold', 'Courier-Oblique', 'Courier-BoldOblique',
'Helvetica', 'Helvetica-Bold', 'Helvetica-Oblique',
'Helvetica-BoldOblique',
'Times-Roman', 'Times-Bold', 'Times-Italic', 'Times-BoldItalic',
'Symbol','ZapfDingbats']
PDFError = 'PDFError'
AFMDIR = '.'
A4 = (595.27,841.89) #default page size
class PDFDocument:
"""Responsible for linking and writing out the whole document.
Builds up a list of objects using add(key, object). Each of these
must inherit from PDFObject and be able to write itself into the file.
For cross-linking, it provides getPosition(key) which tells you where
another object is, or raises a KeyError if not found. The rule is that
objects should only refer ones previously written to file.
"""
def __init__(self):
self.objects = []
self.objectPositions = {}
self.fonts = MakeType1Fonts()
#mapping of Postscriptfont names to internal ones;
#needs to be dynamically built once we start adding
#fonts in.
self.fontMapping = {}
for i in range(len(StandardEnglishFonts)):
psname = StandardEnglishFonts[i]
pdfname = '/F%d' % (i+1)
self.fontMapping[psname] = pdfname
self.pages = []
self.pagepositions = []
# position 1
cat = PDFCatalog()
cat.RefPages = 3
cat.RefOutlines = 2
self.add('Catalog', cat)
# position 2 - outlines
outl = PDFOutline()
self.add('Outline', outl)
# position 3 - pages collection
self.PageCol = PDFPageCollection()
self.add('PagesTreeRoot',self.PageCol)
# positions 4-16 - fonts
fontstartpos = len(self.objects) + 1
for font in self.fonts:
self.add('Font.'+font.keyname, font)
self.fontdict = MakeFontDictionary(fontstartpos, len(self.fonts))
# position 17 - Info
self.info = PDFInfo() #hang onto it!
self.add('Info', self.info)
self.infopos = len(self.objects) #1-based, this gives its position
def add(self, key, obj):
self.objectPositions[key] = len(self.objects) # its position
self.objects.append(obj)
obj.doc = self
return len(self.objects) - 1 # give its position
def getPosition(self, key):
"""Tell you where the given object is in the file - used for
cross-linking; an object can call self.doc.getPosition("Page001")
to find out where the object keyed under "Page001" is stored."""
return self.objectPositions[key]
def setTitle(self, title):
"embeds in PDF file"
self.info.title = title
def setAuthor(self, author):
"embedded in PDF file"
self.info.author = author
def setSubject(self, subject):
"embeds in PDF file"
self.info.subject = subject
def printXref(self):
self.startxref = sys.stdout.tell()
print 'xref'
print 0,len(self.objects) + 1
print '0000000000 65535 f'
for pos in self.xref:
print '%0.10d 00000 n' % pos
def writeXref(self, f):
self.startxref = f.tell()
f.write('xref' + LINEEND)
f.write('0 %d' % (len(self.objects) + 1) + LINEEND)
f.write('0000000000 65535 f' + LINEEND)
for pos in self.xref:
f.write('%0.10d 00000 n' % pos + LINEEND)
def printTrailer(self):
print 'trailer'
print '<< /Size %d /Root %d 0 R /Info %d 0 R>>' % (len(self.objects) + 1, 1, self.infopos)
print 'startxref'
print self.startxref
def writeTrailer(self, f):
f.write('trailer' + LINEEND)
f.write('<< /Size %d /Root %d 0 R /Info %d 0 R>>' % (len(self.objects) + 1, 1, self.infopos) + LINEEND)
f.write('startxref' + LINEEND)
f.write(str(self.startxref) + LINEEND)
def SaveToFile(self, filename):
fileobj = open(filename, 'wb')
self.SaveToFileObject(fileobj)
fileobj.close()
def SaveToFileObject(self, fileobj):
"""Open a file, and ask each object in turn to write itself to
the file. Keep track of the file position at each point for
use in the index at the end"""
f = fileobj
i = 1
self.xref = []
f.write("%PDF-1.2" + LINEEND) # for CID support
f.write("%íì¶¾" + LINEEND)
for obj in self.objects:
pos = f.tell()
self.xref.append(pos)
f.write(str(i) + ' 0 obj' + LINEEND)
obj.save(f)
f.write('endobj' + LINEEND)
i = i + 1
self.writeXref(f)
self.writeTrailer(f)
f.write('%%EOF') # no lineend needed on this one!
# with the Mac, we need to tag the file in a special
#way so the system knows it is a PDF file.
#This supplied by Joe Strout
if os.name == 'mac':
import macfs
try:
macfs.FSSpec(filename).SetCreatorType('CARO','PDF ')
except:
pass
def printPDF(self):
"prints it to standard output. Logs positions for doing trailer"
print "%PDF-1.0"
print "%íì¶¾"
i = 1
self.xref = []
for obj in self.objects:
pos = sys.stdout.tell()
self.xref.append(pos)
print i, '0 obj'
obj.printPDF()
print 'endobj'
i = i + 1
self.printXref()
self.printTrailer()
print "%%EOF",
def addPage(self, page):
"""adds page and stream at end. Maintains pages list"""
#page.buildstream()
pos = len(self.objects) # work out where added
page.ParentPos = 3 #pages collection
page.info = {
'parentpos':3,
'fontdict':self.fontdict,
'contentspos':pos + 2,
}
self.PageCol.PageList.append(pos+1)
self.add('Page%06d'% len(self.PageCol.PageList), page)
#self.objects.append(page)
self.add('PageStream%06d'% len(self.PageCol.PageList), page.stream)
#self.objects.append(page.stream)
def hasFont(self, psfontname):
return self.fontMapping.has_key(psfontname)
def getInternalFontName(self, psfontname):
try:
return self.fontMapping[psfontname]
except:
raise PDFError, "Font %s not available in document" % psfontname
def getAvailableFonts(self):
fontnames = self.fontMapping.keys()
fontnames.sort()
return fontnames
##############################################################
#
# Utilities
#
##############################################################
class OutputGrabber:
"""At times we need to put something in the place of standard
output. This grabs stdout, keeps the data, and releases stdout
when done.
NOT working well enough!"""
def __init__(self):
self.oldoutput = sys.stdout
sys.stdout = self
self.closed = 0
self.data = []
def write(self, x):
if not self.closed:
self.data.append(x)
def getData(self):
return string.join(self.data)
def close(self):
sys.stdout = self.oldoutput
self.closed = 1
def __del__(self):
if not self.closed:
self.close()
def testOutputGrabber():
gr = OutputGrabber()
for i in range(10):
print 'line',i
data = gr.getData()
gr.close()
print 'Data...',data
##############################################################
#
# PDF Object Hierarchy
#
##############################################################
class PDFObject:
"""Base class for all PDF objects. In PDF, precise measurement
of file offsets is essential, so the usual trick of just printing
and redirecting output has proved to give different behaviour on
Mac and Windows. While it might be soluble, I'm taking charge
of line ends at the binary level and explicitly writing to a file.
The LINEEND constant lets me try CR, LF and CRLF easily to help
pin down the problem."""
def save(self, file):
"Save its content to an open file"
file.write('% base PDF object' + LINEEND)
def printPDF(self):
self.save(sys.stdout)
class PDFLiteral(PDFObject):
" a ready-made one you wish to quote"
def __init__(self, text):
self.text = text
def save(self, file):
file.write(self.text + LINEEND)
class PDFCatalog(PDFObject):
"requires RefPages and RefOutlines set"
def __init__(self):
self.template = string.join([
'<<',
'/Type /Catalog',
'/Pages %d 0 R',
'/Outlines %d 0 R',
'>>'
],LINEEND
)
def save(self, file):
file.write(self.template % (self.RefPages, self.RefOutlines) + LINEEND)
class PDFInfo(PDFObject):
"""PDF documents can have basic information embedded, viewable from
File | Document Info in Acrobat Reader. If this is wrong, you get
Postscript errors while printing, even though it does not print."""
def __init__(self):
self.title = "untitled"
self.author = "anonymous"
self.subject = "unspecified"
now = time.localtime(time.time())
self.datestr = '%04d%02d%02d%02d%02d%02d' % tuple(now[0:6])
def save(self, file):
file.write(string.join([
"<</Title (%s)",
"/Author (%s)",
"/CreationDate (D:%s)",
"/Producer (PDFgen)",
"/Subject (%s)",
">>"
], LINEEND
) % (
pdfutils._escape(self.title),
pdfutils._escape(self.author),
self.datestr,
pdfutils._escape(self.subject)
) + LINEEND)
class PDFOutline(PDFObject):
"null outline, does nothing yet"
def __init__(self):
self.template = string.join([
'<<',
'/Type /Outlines',
'/Count 0',
'>>'],
LINEEND)
def save(self, file):
file.write(self.template + LINEEND)
class PDFPageCollection(PDFObject):
"presumes PageList attribute set (list of integers)"
def __init__(self):
self.PageList = []
def save(self, file):
lines = [ '<<',
'/Type /Pages',
'/Count %d' % len(self.PageList),
'/Kids ['
]
for page in self.PageList:
lines.append(str(page) + ' 0 R ')
lines.append(']')
lines.append('>>')
text = string.join(lines, LINEEND)
file.write(text + LINEEND)
class PDFPage(PDFObject):
"""The Bastard. Needs list of Resources etc. Use a standard one for now.
It manages a PDFStream object which must be added to the document's list
of objects as well."""
def __init__(self):
self.drawables = []
self.pagewidth = 595 #these are overridden by piddlePDF
self.pageheight = 842
self.stream = PDFStream()
self.hasImages = 0
self.pageTransitionString = '' # presentation effects
# editors on different systems may put different things in the line end
# without me noticing. No triple-quoted strings allowed!
self.template = string.join([
'<<',
'/Type /Page',
'/Parent %(parentpos)d 0 R',
'/Resources',
' <<',
' /Font %(fontdict)s',
' /ProcSet %(procsettext)s',
' >>',
'/MediaBox [0 0 %(pagewidth)d %(pageheight)d]', #A4 by default
'/Contents %(contentspos)d 0 R',
'%(transitionString)s',
'>>'],
LINEEND)
def setCompression(self, onoff=0):
"Turns page compression on or off"
assert onoff in [0,1], "Page compression options are 1=on, 2=off"
self.stream.compression = onoff
def save(self, file):
self.info['pagewidth'] = self.pagewidth
self.info['pageheight'] = self.pageheight
# check for image support
if self.hasImages:
self.info['procsettext'] = '[/PDF /Text /ImageC]'
else:
self.info['procsettext'] = '[/PDF /Text]'
self.info['transitionString'] = self.pageTransitionString
file.write(self.template % self.info + LINEEND)
def clear(self):
self.drawables = []
def setStream(self, data):
if type(data) is ListType:
data = string.join(data, LINEEND)
self.stream.setStream(data)
TestStream = "BT /F6 24 Tf 80 672 Td 24 TL ( ) Tj T* ET"
class PDFStream(PDFObject):
"Used for the contents of a page"
def __init__(self):
self.data = None
self.compression = 0
def setStream(self, data):
self.data = data
def save(self, file):
#avoid crashes if they wrote nothing in the page
if self.data == None:
self.data = TestStream
if self.compression == 1:
comp = zlib.compress(self.data) #this bit is very fast...
base85 = pdfutils._AsciiBase85Encode(comp) #...sadly this isn't
wrapped = pdfutils._wrap(base85)
data_to_write = wrapped
else:
data_to_write = self.data
# the PDF length key should contain the length including
# any extra LF pairs added by Print on DOS.
#lines = len(string.split(self.data,'\n'))
#length = len(self.data) + lines # one extra LF each
length = len(data_to_write) + len(LINEEND) #AR 19980202
if self.compression:
file.write('<< /Length %d /Filter [/ASCII85Decode /FlateDecode]>>' % length + LINEEND)
else:
file.write('<< /Length %d >>' % length + LINEEND)
file.write('stream' + LINEEND)
file.write(data_to_write + LINEEND)
file.write('endstream' + LINEEND)
class PDFImage(PDFObject):
# sample one while developing. Currently, images go in a literals
def save(self, file):
file.write(string.join([
'<<',
'/Type /XObject',
'/Subtype /Image',
'/Name /Im0',
'/Width 24',
'/Height 23',
'/BitsPerComponent 1',
'/ColorSpace /DeviceGray',
'/Filter /ASCIIHexDecode',
'/Length 174',
'>>',
'stream',
'003B00 002700 002480 0E4940 114920 14B220 3CB650',
'75FE88 17FF8C 175F14 1C07E2 3803C4 703182 F8EDFC',
'B2BBC2 BB6F84 31BFC2 18EA3C 0E3E00 07FC00 03F800',
'1E1800 1FF800>',
'endstream',
'endobj'
], LINEEND) + LINEEND)
class PDFType1Font(PDFObject):
def __init__(self, key, font):
self.fontname = font
self.keyname = key
self.template = string.join([
'<<',
'/Type /Font',
'/Subtype /Type1',
'/Name /%s',
'/BaseFont /%s',
'/Encoding /MacRomanEncoding',
'>>'],
LINEEND)
def save(self, file):
file.write(self.template % (self.keyname, self.fontname) + LINEEND)
##############################################################
#
# some helpers
#
##############################################################
def MakeType1Fonts():
"returns a list of all the standard font objects"
fonts = []
pos = 1
for fontname in StandardEnglishFonts:
font = PDFType1Font('F'+str(pos), fontname)
fonts.append(font)
pos = pos + 1
return fonts
def MakeFontDictionary(startpos, count):
"returns a font dictionary assuming they are all in the file from startpos"
dict = " <<" + LINEEND
pos = startpos
for i in range(count):
dict = dict + '\t\t/F%d %d 0 R ' % (i + 1, startpos + i) + LINEEND
dict = dict + "\t\t>>" + LINEEND
return dict
if __name__ == '__main__':
print 'For test scripts, run test1.py to test6.py'