# rdkit/Python/sping/PDF/pdfdoc.py

#pdfdoc.py
"""
PDFgen is a library to generate PDF files containing text and graphics.  It is the
foundation for a complete reporting solution in Python.

The module pdfdoc.py handles the 'outer structure' of PDF documents, ensuring that
all objects are properly cross-referenced and indexed to the nearest byte.  The
'inner structure' - the page descriptions - is presumed to have been generated
before each page is saved.
pdfgen.py calls this and provides a 'canvas' object to handle page marking operators.
piddlePDF calls pdfgen and offers a high-level interface.

(C) Copyright Andy Robinson 1998-1999
"""
import os
import sys
import string
import time
import tempfile
import cStringIO
from types import *
from math import sin, cos, pi, ceil

try:
    import zlib
except:
    print "zlib not available, page compression not available"


from pdfgeom import bezierArc

import pdfutils
from pdfutils import LINEEND   # this constant needed in both
import pdfmetrics
##############################################################
#
#            Constants and declarations
#
##############################################################


# PostScript names of the fonts that every PDFDocument registers; the order
# matters because PDFDocument and MakeType1Fonts number them /F1, /F2, ...
# by position in this list.
StandardEnglishFonts = [
    'Courier', 'Courier-Bold', 'Courier-Oblique', 'Courier-BoldOblique',
    'Helvetica', 'Helvetica-Bold', 'Helvetica-Oblique',
    'Helvetica-BoldOblique',
    'Times-Roman', 'Times-Bold', 'Times-Italic', 'Times-BoldItalic',
    'Symbol','ZapfDingbats']


class PDFError(Exception):
    """Raised for document-level problems, e.g. asking for an unknown font.

    This used to be the plain string 'PDFError'.  String exceptions can no
    longer be raised on Python 2.6 and later, so it is now a real exception
    class; ``raise PDFError, "message"`` and ``except PDFError:`` keep
    working unchanged.
    """


# NOTE(review): presumably the directory searched for AFM font-metric files;
# it is not used anywhere in this module - confirm against pdfmetrics.
AFMDIR = '.'

A4 = (595.27,841.89)   #default page size

class PDFDocument:
    """Responsible for linking and writing out the whole document.
    Builds up a list of objects using add(key, object).  Each of these
    must inherit from PDFObject and be able to write itself into the file.
    For cross-linking, it provides getPosition(key) which tells you where
    another object is, or raises a KeyError if not found.  The rule is that
    objects should only refer ones previously written to file.
    """
    def __init__(self):
        self.objects = []
        self.objectPositions = {}

        self.fonts = MakeType1Fonts()

        #mapping of Postscriptfont names to internal ones;
        #needs to be dynamically built once we start adding
        #fonts in.
        self.fontMapping = {}
        for i in range(len(StandardEnglishFonts)):
            psname = StandardEnglishFonts[i]
            pdfname = '/F%d' % (i+1)
            self.fontMapping[psname] = pdfname


        self.pages = []
        self.pagepositions = []

        # position 1
        cat = PDFCatalog()
        cat.RefPages = 3
        cat.RefOutlines = 2
        self.add('Catalog', cat)

        # position 2 - outlines
        outl = PDFOutline()
        self.add('Outline', outl)

        # position 3 - pages collection
        self.PageCol = PDFPageCollection()
        self.add('PagesTreeRoot',self.PageCol)

        # positions 4-16 - fonts
        fontstartpos = len(self.objects) + 1
        for font in self.fonts:
            self.add('Font.'+font.keyname, font)
        self.fontdict = MakeFontDictionary(fontstartpos, len(self.fonts))

        # position 17 - Info
        self.info = PDFInfo()  #hang onto it!
        self.add('Info', self.info)
        self.infopos = len(self.objects)  #1-based, this gives its position


    def add(self, key, obj):
        self.objectPositions[key] = len(self.objects)  # its position
        self.objects.append(obj)
        obj.doc = self
        return len(self.objects) - 1  # give its position

    def getPosition(self, key):
        """Tell you where the given object is in the file - used for
        cross-linking; an object can call self.doc.getPosition("Page001")
        to find out where the object keyed under "Page001" is stored."""
        return self.objectPositions[key]

    def setTitle(self, title):
        "embeds in PDF file"
        self.info.title = title

    def setAuthor(self, author):
        "embedded in PDF file"
        self.info.author = author

    def setSubject(self, subject):
        "embeds in PDF file"
        self.info.subject = subject


    def printXref(self):
        self.startxref = sys.stdout.tell()
        print 'xref'
        print 0,len(self.objects) + 1
        print '0000000000 65535 f'
        for pos in self.xref:
            print '%0.10d 00000 n' % pos

    def writeXref(self, f):
        self.startxref = f.tell()
        f.write('xref' + LINEEND)
        f.write('0 %d' % (len(self.objects) + 1) + LINEEND)
        f.write('0000000000 65535 f' + LINEEND)
        for pos in self.xref:
            f.write('%0.10d 00000 n' % pos + LINEEND)


    def printTrailer(self):
        print 'trailer'
        print '<< /Size %d /Root %d 0 R /Info %d 0 R>>' % (len(self.objects) + 1, 1, self.infopos)
        print 'startxref'
        print self.startxref

    def writeTrailer(self, f):
        f.write('trailer' + LINEEND)
        f.write('<< /Size %d /Root %d 0 R /Info %d 0 R>>' % (len(self.objects) + 1, 1, self.infopos)  + LINEEND)
        f.write('startxref' + LINEEND)
        f.write(str(self.startxref)  + LINEEND)

    def SaveToFile(self, filename):
        fileobj = open(filename, 'wb')
        self.SaveToFileObject(fileobj)
        fileobj.close()

    def SaveToFileObject(self, fileobj):
        """Open a file, and ask each object in turn to write itself to
        the file.  Keep track of the file position at each point for
        use in the index at the end"""
        f = fileobj
        i = 1
        self.xref = []
        f.write("%PDF-1.2" + LINEEND)  # for CID support
        f.write("%íì¶¾" + LINEEND)
        for obj in self.objects:
            pos = f.tell()
            self.xref.append(pos)
            f.write(str(i) + ' 0 obj' + LINEEND)
            obj.save(f)
            f.write('endobj' + LINEEND)
            i = i + 1
        self.writeXref(f)
        self.writeTrailer(f)
        f.write('%%EOF')  # no lineend needed on this one!

        # with the Mac, we need to tag the file in a special
        #way so the system knows it is a PDF file.
        #This supplied by Joe Strout
        if os.name == 'mac':
            import macfs
            try:
                macfs.FSSpec(filename).SetCreatorType('CARO','PDF ')
            except:
                pass


    def printPDF(self):
        "prints it to standard output.  Logs positions for doing trailer"
        print "%PDF-1.0"
        print "%íì¶¾"
        i = 1
        self.xref = []
        for obj in self.objects:
            pos = sys.stdout.tell()
            self.xref.append(pos)
            print i, '0 obj'
            obj.printPDF()
            print 'endobj'
            i = i + 1
        self.printXref()
        self.printTrailer()
        print "%%EOF",

    def addPage(self, page):
        """adds page and stream at end.  Maintains pages list"""
        #page.buildstream()
        pos = len(self.objects) # work out where added

        page.ParentPos = 3   #pages collection
        page.info = {
            'parentpos':3,
            'fontdict':self.fontdict,
            'contentspos':pos + 2,
            }

        self.PageCol.PageList.append(pos+1)
        self.add('Page%06d'% len(self.PageCol.PageList), page)
        #self.objects.append(page)
        self.add('PageStream%06d'% len(self.PageCol.PageList), page.stream)
        #self.objects.append(page.stream)

    def hasFont(self, psfontname):
        return self.fontMapping.has_key(psfontname)

    def getInternalFontName(self, psfontname):
        try:
            return self.fontMapping[psfontname]
        except:
            raise PDFError, "Font %s not available in document" % psfontname

    def getAvailableFonts(self):
        fontnames = self.fontMapping.keys()
        fontnames.sort()
        return fontnames

##############################################################
#
#            Utilities
#
##############################################################

class OutputGrabber:
    """At times we need to put something in the place of standard
    output.  This grabs stdout, keeps the data, and releases stdout
    when done.

    Creating an instance installs it as sys.stdout; call close() to put
    the previous stream back.

    NOT working well enough!"""
    def __init__(self):
        self.oldoutput = sys.stdout
        sys.stdout = self
        self.closed = 0
        self.data = []

    def write(self, x):
        "Record x; anything written after close() is dropped."
        if not self.closed:
            self.data.append(x)

    def getData(self):
        "Return everything captured so far as a single string."
        # Join with the empty string: the default separator of string.join
        # is a space, which inserted a bogus blank between every write.
        return ''.join(self.data)

    def close(self):
        "Restore the stream that was sys.stdout when we were created."
        sys.stdout = self.oldoutput
        self.closed = 1

    def __del__(self):
        if not self.closed:
            self.close()


def testOutputGrabber():
    gr = OutputGrabber()
    for i in range(10):
        print 'line',i
    data = gr.getData()
    gr.close()
    print 'Data...',data


##############################################################
#
#            PDF Object Hierarchy
#
##############################################################


class PDFObject:
    """Base class for all PDF objects.

    In PDF, precise measurement of file offsets is essential, so the
    usual trick of just printing and redirecting output has proved to
    give different behaviour on Mac and Windows.  While it might be
    soluble, line ends are handled at the binary level here and written
    explicitly to a file.  The LINEEND constant makes it easy to try
    CR, LF and CRLF to help pin down the problem."""

    def save(self, file):
        "Write this object's content to the open file; subclasses override"
        placeholder = '% base PDF object'
        file.write(placeholder + LINEEND)

    def printPDF(self):
        "Write this object to standard output by delegating to save()"
        target = sys.stdout
        self.save(target)


class PDFLiteral(PDFObject):
    "wraps ready-made text that is written out verbatim"

    def __init__(self, text):
        self.text = text

    def save(self, file):
        "Write the stored text followed by one line end"
        output = self.text + LINEEND
        file.write(output)


class PDFCatalog(PDFObject):
    "the catalog object; RefPages and RefOutlines must be set before save()"

    def __init__(self):
        # two %d slots: the object numbers of the pages tree and the outlines
        parts = [
            '<<',
            '/Type /Catalog',
            '/Pages %d 0 R',
            '/Outlines %d 0 R',
            '>>',
        ]
        self.template = LINEEND.join(parts)

    def save(self, file):
        "Fill in the two references and write the catalog dictionary"
        refs = (self.RefPages, self.RefOutlines)
        file.write(self.template % refs + LINEEND)


class PDFInfo(PDFObject):
    """PDF documents can have basic information embedded, viewable from
    File | Document Info in Acrobat Reader.  If this is wrong, you get
    Postscript errors while printing, even though it does not print."""

    def __init__(self):
        self.title = "untitled"
        self.author = "anonymous"
        self.subject = "unspecified"

        # creation stamp: local year, month, day, hour, minute, second
        stamp = time.localtime()[:6]
        self.datestr = '%04d%02d%02d%02d%02d%02d' % tuple(stamp)

    def save(self, file):
        "Write the info dictionary, escaping the three free-text fields"
        template = LINEEND.join([
            "<</Title (%s)",
            "/Author (%s)",
            "/CreationDate (D:%s)",
            "/Producer (PDFgen)",
            "/Subject (%s)",
            ">>",
        ])
        values = (
            pdfutils._escape(self.title),
            pdfutils._escape(self.author),
            self.datestr,
            pdfutils._escape(self.subject),
        )
        file.write(template % values + LINEEND)


class PDFOutline(PDFObject):
    "an empty outline dictionary (count of zero); does nothing yet"

    def __init__(self):
        parts = ['<<', '/Type /Outlines', '/Count 0', '>>']
        self.template = LINEEND.join(parts)

    def save(self, file):
        "Write the fixed outline dictionary"
        file.write(self.template + LINEEND)


class PDFPageCollection(PDFObject):
    "the pages tree root; PageList holds the object numbers of its pages"

    def __init__(self):
        self.PageList = []

    def save(self, file):
        "Write the /Pages dictionary with one '/Kids' reference per page"
        header = ['<<', '/Type /Pages', '/Count %d' % len(self.PageList), '/Kids [']
        kids = [str(number) + ' 0 R ' for number in self.PageList]
        footer = [']', '>>']
        file.write(LINEEND.join(header + kids + footer) + LINEEND)


class PDFPage(PDFObject):
    """The Bastard.  Needs list of Resources etc. Use a standard one for now.
    It manages a PDFStream object which must be added to the document's list
    of objects as well."""
    def __init__(self):
        self.drawables = []
        self.pagewidth = 595  #these are overridden by piddlePDF
        self.pageheight = 842
        self.stream = PDFStream()
        self.hasImages = 0
        self.pageTransitionString = ''  # presentation effects
        # editors on different systems may put different things in the line end
        # without me noticing.  No triple-quoted strings allowed!
        self.template = string.join([
                '<<',
                '/Type /Page',
                '/Parent %(parentpos)d 0 R',
                '/Resources',
                '   <<',
                '   /Font %(fontdict)s',
                '   /ProcSet %(procsettext)s',
                '   >>',
                '/MediaBox [0 0 %(pagewidth)d %(pageheight)d]',  #A4 by default
                '/Contents %(contentspos)d 0 R',
                '%(transitionString)s',
                '>>'],
            LINEEND)
    def setCompression(self, onoff=0):
        "Turns page compression on or off"
        assert onoff in [0,1], "Page compression options are 1=on, 2=off"
        self.stream.compression = onoff

    def save(self, file):
        self.info['pagewidth'] = self.pagewidth
        self.info['pageheight'] = self.pageheight
        # check for image support
        if self.hasImages:
            self.info['procsettext'] = '[/PDF /Text /ImageC]'
        else:
            self.info['procsettext'] = '[/PDF /Text]'
        self.info['transitionString'] = self.pageTransitionString

        file.write(self.template % self.info + LINEEND)

    def clear(self):
        self.drawables = []

    def setStream(self, data):
        if type(data) is ListType:
            data = string.join(data, LINEEND)
        self.stream.setStream(data)

# placeholder content that PDFStream.save writes for a page with no data
TestStream = "BT /F6 24 Tf 80 672 Td 24 TL (   ) Tj T* ET"


class PDFStream(PDFObject):
    """Used for the contents of a page.

    data is the raw page content (None until setStream is called);
    compression is a flag, true meaning the content is written
    zlib-compressed and ASCII-base-85 encoded."""
    def __init__(self):
        self.data = None
        self.compression = 0

    def setStream(self, data):
        "Store the page content to be written by save()"
        self.data = data

    def save(self, file):
        "Write the stream dictionary and the (optionally compressed) content"
        #avoid crashes if they wrote nothing in the page
        if self.data is None:
            self.data = TestStream

        # One flag decides both the encoding of the data and the /Filter
        # entry, so the header can never disagree with what is written.
        if self.compression:
            # zlib is imported optionally at module level; this raises
            # NameError if that import failed.
            comp = zlib.compress(self.data)   #this bit is very fast...
            base85 = pdfutils._AsciiBase85Encode(comp) #...sadly this isn't
            data_to_write = pdfutils._wrap(base85)
        else:
            data_to_write = self.data

        # the length counts the line end written after the data  #AR 19980202
        length = len(data_to_write) + len(LINEEND)
        if self.compression:
            file.write('<< /Length %d /Filter [/ASCII85Decode /FlateDecode]>>' % length + LINEEND)
        else:
            file.write('<< /Length %d >>' % length + LINEEND)
        file.write('stream' + LINEEND)
        file.write(data_to_write + LINEEND)
        file.write('endstream' + LINEEND)

class PDFImage(PDFObject):
    """A hard-coded sample image used while developing; currently, images
    go in as literals instead."""
    # NOTE(review): the text written below ends with its own 'endobj', and
    # PDFDocument.SaveToFileObject writes 'endobj' after every object too,
    # so adding one of these to a document yields two - confirm intended.

    def save(self, file):
        "Write the fixed 24x23 one-bit sample image object"
        dictionary = [
            '<<',
            '/Type /XObject',
            '/Subtype /Image',
            '/Name /Im0',
            '/Width 24',
            '/Height 23',
            '/BitsPerComponent 1',
            '/ColorSpace /DeviceGray',
            '/Filter /ASCIIHexDecode',
            '/Length 174',
            '>>',
        ]
        body = [
            'stream',
            '003B00 002700 002480 0E4940 114920 14B220 3CB650',
            '75FE88 17FF8C 175F14 1C07E2 3803C4 703182 F8EDFC',
            'B2BBC2 BB6F84 31BFC2 18EA3C 0E3E00 07FC00 03F800',
            '1E1800 1FF800>',
            'endstream',
            'endobj',
        ]
        file.write(LINEEND.join(dictionary + body) + LINEEND)

class PDFType1Font(PDFObject):
    "a Type 1 font object; key is its internal name, font its base font name"

    def __init__(self, key, font):
        self.keyname = key
        self.fontname = font
        # two %s slots: the internal name, then the base font name
        parts = [
            '<<',
            '/Type /Font',
            '/Subtype /Type1',
            '/Name /%s',
            '/BaseFont /%s',
            '/Encoding /MacRomanEncoding',
            '>>',
        ]
        self.template = LINEEND.join(parts)

    def save(self, file):
        "Write the font dictionary with both names filled in"
        names = (self.keyname, self.fontname)
        file.write(self.template % names + LINEEND)


##############################################################
#
#            some helpers
#
##############################################################

def MakeType1Fonts():
    "returns one PDFType1Font per standard font, keyed F1, F2, ... in list order"
    return [PDFType1Font('F' + str(index + 1), fontname)
            for index, fontname in enumerate(StandardEnglishFonts)]

def MakeFontDictionary(startpos, count):
    """returns a font dictionary assuming they are all in the file from startpos

    startpos is the object number of the first font and count the number
    of fonts; font /Fn is taken to be object startpos + n - 1.  The result
    is the dictionary text, one entry per line, each line ending in LINEEND.
    """
    entries = ['\t\t/F%d %d 0 R ' % (i + 1, startpos + i) + LINEEND
               for i in range(count)]
    return "  <<" + LINEEND + ''.join(entries) + "\t\t>>" + LINEEND

if __name__ == '__main__':
    # nothing to run here; just point at the separate test scripts
    print('For test scripts, run test1.py to test6.py')