rdkit/Python/sping/PDF/pdfutils.py

# pdfutils.py - everything to do with images, streams,
# compression, and some constants

import os
import string
import cStringIO

LINEEND = '\015\012'

##########################################################
#
#  image compression helpers.  Preprocessing a directory
#  of images will offer a vast speedup.
#
##########################################################
def cacheImageFile(filename):
    "Processes the image as if for encoding, saves to a file ending in AHX"
    from PIL import Image
    import zlib
    img1 = Image.open(filename)
    img = img1.convert('RGB')
    imgwidth, imgheight = img.size
    code = []
    code.append('BI')   # begin image
    # this describes what is in the image itself
    code.append('/W %s /H %s /BPC 8 /CS /RGB /F [/A85 /Fl]' % (imgwidth, imgheight))
    code.append('ID')
    #use a flate filter and Ascii Base 85
    raw = img.tostring()
    assert(len(raw) == imgwidth * imgheight, "Wrong amount of data for image")
    compressed = zlib.compress(raw)   #this bit is very fast...
    encoded = _AsciiBase85Encode(compressed) #...sadly this isn't

    #write in blocks of 60 characters per line
    outstream = cStringIO.StringIO(encoded)
    dataline = outstream.read(60)
    while dataline <> "":
        code.append(dataline)
        dataline = outstream.read(60)

    code.append('EI')

    #save it to a file
    cachedname = os.path.splitext(filename)[0] + '.a85'
    f = open(cachedname,'wb')
    f.write(string.join(code, LINEEND)+LINEEND)
    f.close()
    print 'cached image as %s' % cachedname


def preProcessImages(spec):
    """accepts either a filespec ('C:\mydir\*.jpg') or a list
    of image filenames, crunches them all to save time.  Run this
    to save huge amounts of time when repeatedly building image
    documents."""
    import types
    if type(spec) is types.StringType:
        filelist = glob.glob(spec)
    else:  #list or tuple OK
        filelist = spec

    for filename in filelist:
        if cachedImageExists(filename):
            print 'cached version of %s already exists' % filename
        else:
            cacheImageFile(filename)


def cachedImageExists(filename):
    """Determines if a cached image exists which has
    the same name and equal or newer date to the given
    file."""
    cachedname = os.path.splitext(filename)[0] + '.a85'
    if os.path.isfile(cachedname):
        #see if it is newer
        original_date = os.stat(filename)[8]
        cached_date = os.stat(cachedname)[8]
        if original_date > cached_date:
            return 0
        else:
            return 1
    else:
        return 0


##############################################################
#
#            PDF Helper functions
#
##############################################################
def _escape(s):
    """PDF escapes are almost like Python ones, but brackets
    need slashes before them too. Use Python's repr function
    and chop off the quotes first"""
    s = repr(s)[1:-1]
    s = string.replace(s, '(','\(')
    s = string.replace(s, ')','\)')
    return s


def _normalizeLineEnds(text,desired=LINEEND):
    """ensures all instances of CR, LF and CRLF end up as the specified one"""
    unlikely = '\000\001\002\003'
    text = string.replace(text, '\015\012', unlikely)
    text = string.replace(text, '\015', unlikely)
    text = string.replace(text, '\012', unlikely)
    text = string.replace(text, unlikely, desired)
    return text

def _AsciiHexEncode(input):
    """This is a verbose encoding used for binary data within
    a PDF file.  One byte binary becomes two bytes of ASCII."""
    "Helper function used by images"
    output = cStringIO.StringIO()
    for char in input:
        output.write('%02x' % ord(char))
    output.write('>')
    output.reset()
    return output.read()

def _AsciiHexDecode(input):
    "Not used except to provide a test of the preceding"
    #strip out all whitespace
    stripped = string.join(string.split(input),'')
    assert stripped[-1] == '>', 'Invalid terminator for Ascii Hex Stream'
    stripped = stripped[:-1]  #chop off terminator
    assert len(stripped) % 2 == 0, 'Ascii Hex stream has odd number of bytes'
    i = 0
    output = cStringIO.StringIO()
    while i < len(stripped):
        twobytes = stripped[i:i+2]
        output.write(chr(eval('0x'+twobytes)))
        i = i + 2
    output.reset()
    return output.read()

def _AsciiHexTest(text='What is the average velocity of a sparrow?'):
    "Do the obvious test for whether Ascii Hex encoding works"
    print 'Plain text:', text
    encoded = _AsciiHexEncode(text)
    print 'Encoded:', encoded
    decoded = _AsciiHexDecode(encoded)
    print 'Decoded:', decoded
    if decoded == text:
        print 'Passed'
    else:
        print 'Failed!'

def _AsciiBase85Encode(input):
    """This is a compact encoding used for binary data within
    a PDF file.  Four bytes of binary data become five bytes of
    ASCII.  This is the default method used for encoding images."""
    outstream = cStringIO.StringIO()
    # special rules apply if not a multiple of four bytes.
    whole_word_count, remainder_size = divmod(len(input), 4)
    cut = 4 * whole_word_count
    body, lastbit = input[0:cut], input[cut:]

    for i in range(whole_word_count):
        offset = i*4
        b1 = ord(body[offset])
        b2 = ord(body[offset+1])
        b3 = ord(body[offset+2])
        b4 = ord(body[offset+3])

        num = 16777216L * b1 + 65536 * b2 + 256 * b3 + b4

        if num == 0:
            #special case
            outstream.write('z')
        else:
            #solve for five base-85 numbers
            temp, c5 = divmod(num, 85)
            temp, c4 = divmod(temp, 85)
            temp, c3 = divmod(temp, 85)
            c1, c2 = divmod(temp, 85)
            assert ((85**4) * c1) + ((85**3) * c2) + ((85**2) * c3) + (85*c4) + c5 == num, 'dodgy code!'
            outstream.write(chr(c1+33))
            outstream.write(chr(c2+33))
            outstream.write(chr(c3+33))
            outstream.write(chr(c4+33))
            outstream.write(chr(c5+33))

    # now we do the final bit at the end.  I repeated this separately as
    # the loop above is the time-critical part of a script, whereas this
    # happens only once at the end.

    #encode however many bytes we have as usual
    if remainder_size > 0:
        while len(lastbit) < 4:
            lastbit = lastbit + '\000'
        b1 = ord(lastbit[0])
        b2 = ord(lastbit[1])
        b3 = ord(lastbit[2])
        b4 = ord(lastbit[3])

        num = 16777216L * b1 + 65536 * b2 + 256 * b3 + b4

        #solve for c1..c5
        temp, c5 = divmod(num, 85)
        temp, c4 = divmod(temp, 85)
        temp, c3 = divmod(temp, 85)
        c1, c2 = divmod(temp, 85)

        #print 'encoding: %d %d %d %d -> %d -> %d %d %d %d %d' % (
        #    b1,b2,b3,b4,num,c1,c2,c3,c4,c5)
        lastword = chr(c1+33) + chr(c2+33) + chr(c3+33) + chr(c4+33) + chr(c5+33)
        #write out most of the bytes.
        outstream.write(lastword[0:remainder_size + 1])

    #terminator code for ascii 85
    outstream.write('~>')
    outstream.reset()
    return outstream.read()


def _AsciiBase85Decode(input):
    """This is not used - Acrobat Reader decodes for you - but a round
    trip is essential for testing."""
    outstream = cStringIO.StringIO()
    #strip all whitespace
    stripped = string.join(string.split(input),'')
    #check end
    assert stripped[-2:] == '~>', 'Invalid terminator for Ascii Base 85 Stream'
    stripped = stripped[:-2]  #chop off terminator

    #may have 'z' in it which complicates matters - expand them
    stripped = string.replace(stripped,'z','!!!!!')
    # special rules apply if not a multiple of five bytes.
    whole_word_count, remainder_size = divmod(len(stripped), 5)
    #print '%d words, %d leftover' % (whole_word_count, remainder_size)
    assert remainder_size <> 1, 'invalid Ascii 85 stream!'
    cut = 5 * whole_word_count
    body, lastbit = stripped[0:cut], stripped[cut:]

    for i in range(whole_word_count):
        offset = i*5
        c1 = ord(body[offset]) - 33
        c2 = ord(body[offset+1]) - 33
        c3 = ord(body[offset+2]) - 33
        c4 = ord(body[offset+3]) - 33
        c5 = ord(body[offset+4]) - 33

        num = ((85**4) * c1) + ((85**3) * c2) + ((85**2) * c3) + (85*c4) + c5

        temp, b4 = divmod(num,256)
        temp, b3 = divmod(temp,256)
        b1, b2 = divmod(temp, 256)

        assert  num == 16777216 * b1 + 65536 * b2 + 256 * b3 + b4, 'dodgy code!'
        outstream.write(chr(b1))
        outstream.write(chr(b2))
        outstream.write(chr(b3))
        outstream.write(chr(b4))

    #decode however many bytes we have as usual
    if remainder_size > 0:
        while len(lastbit) < 5:
            lastbit = lastbit + '!'
        c1 = ord(lastbit[0]) - 33
        c2 = ord(lastbit[1]) - 33
        c3 = ord(lastbit[2]) - 33
        c4 = ord(lastbit[3]) - 33
        c5 = ord(lastbit[4]) - 33
        num = ((85**4) * c1) + ((85**3) * c2) + ((85**2) * c3) + (85*c4) + c5
        temp, b4 = divmod(num,256)
        temp, b3 = divmod(temp,256)
        b1, b2 = divmod(temp, 256)
        assert  num == 16777216 * b1 + 65536 * b2 + 256 * b3 + b4, 'dodgy code!'
        #print 'decoding: %d %d %d %d %d -> %d -> %d %d %d %d' % (
        #    c1,c2,c3,c4,c5,num,b1,b2,b3,b4)

        #the last character needs 1 adding; the encoding loses
        #data by rounding the number to x bytes, and when
        #divided repeatedly we get one less
        if remainder_size == 2:
            lastword = chr(b1+1)
        elif remainder_size == 3:
            lastword = chr(b1) + chr(b2+1)
        elif remainder_size == 4:
            lastword = chr(b1) + chr(b2) + chr(b3+1)
        outstream.write(lastword)

    #terminator code for ascii 85
    outstream.reset()
    return outstream.read()

def _wrap(input, columns=60):
    output = []
    length = len(input)
    i = 0
    pos = columns * i
    while pos < length:
        output.append(input[pos:pos+columns])
        i = i + 1
        pos = columns * i

    return string.join(output, LINEEND)


def _AsciiBase85Test(text='What is the average velocity of a sparrow?'):
    "Do the obvious test for whether Base 85 encoding works"
    print 'Plain text:', text
    encoded = _AsciiBase85Encode(text)
    print 'Encoded:', encoded
    decoded = _AsciiBase85Decode(encoded)
    print 'Decoded:', decoded
    if decoded == text:
        print 'Passed'
    else:
        print 'Failed!'