Files
rdkit/Python/Dbase/DbUtils.py

621 lines
17 KiB
Python
Executable File

# $Id$
#
# Copyright (C) 2000-2006 greg Landrum and Rational Discovery LLC
#
# @@ All Rights Reserved @@
#
""" a set of functions for interacting with databases
When possible, it's probably preferable to use a _DbConnection.DbConnect_ object
"""
import RDConfig
try:
from Excel import ExcelWrapper
except:
haveExcel = 0
else:
haveExcel = 1
from Dbase.DbResultSet import DbResultSet,RandomAccessDbResultSet
def _take(fromL,what):
return map(lambda x,y=fromL:y[x],what)
from Dbase import DbModule
import sys,types,string
from Dbase import DbInfo
def GetColumns(dBase,table,fieldString,user='sysdba',password='masterkey',
join=''):
""" gets a set of data from a table
**Arguments**
- dBase: database name
- table: table name
- fieldString: a string with the names of the fields to be extracted,
this should be a comma delimited list
- user and password:
- join: a join clause (omit the verb 'join')
**Returns**
- a list of the data
"""
cn = DbModule.connect(dBase,user,password)
c = cn.cursor()
cmd = 'select %s from %s'%(fieldString,table)
if join:
if join.strip().find('join') != 0:
join = 'join %s'%(join)
cmd +=' ' + join
c.execute(cmd)
return c.fetchall()
def GetData(dBase,table,fieldString='*',whereString='',user='sysdba',password='masterkey',
removeDups=-1,join='',forceList=0,transform=None,randomAccess=1,extras=None):
""" a more flexible method to get a set of data from a table
**Arguments**
- fields: a string with the names of the fields to be extracted,
this should be a comma delimited list
- where: the SQL where clause to be used with the DB query
- removeDups indicates the column which should be used to screen
out duplicates. Only the first appearance of a duplicate will
be left in the dataset.
**Returns**
- a list of the data
**Notes**
- EFF: this isn't particularly efficient
"""
cn = DbModule.connect(dBase,user,password)
c = cn.cursor()
cmd = 'select %s from %s'%(fieldString,table)
if join:
if join.strip().find('join') != 0:
join = 'join %s'%(join)
cmd += ' ' + join
if whereString:
if whereString.strip().find('where')!=0:
whereString = 'where %s'%(whereString)
cmd += ' ' + whereString
if forceList:
try:
if not extras:
c.execute(cmd)
else:
c.execute(cmd,extras)
except:
sys.stderr.write('the command "%s" generated errors:\n'%(cmd))
import traceback
traceback.print_exc()
return None
if transform is not None:
raise ValueError,'forceList and transform arguments are not compatible'
if not randomAccess:
raise ValueError,'when forceList is set, randomAccess must also be used'
data = c.fetchall()
if removeDups>0:
seen = []
for entry in data[:]:
if entry[removeDups] in seen:
data.remove(entry)
else:
seen.append(entry[removeDups])
else:
if randomAccess:
klass = RandomAccessDbResultSet
else:
klass = DbResultSet
data = klass(c,cn,cmd,removeDups=removeDups,transform=transform,extras=extras)
return data
def DatabaseToExcel(dBase,table,fields='*',join='',where='',wrapper=None,
user='sysdba',password='masterkey',lowMemory=False):
""" Pulls the contents of a database and puts them in an Excel worksheet
**Arguments**
- dBase: the name of the DB file to be used
- table: the name of the table to query
- fields: the fields to select with the SQL query
- join: the join clause of the SQL query
(e.g. 'join foo on foo.bar=base.bar')
- where: the where clause of the SQL query
(e.g. 'where foo = 2' or 'where bar > 17.6')
- wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used
in interacting with Excel
- user: the username for DB access
- password: the password to be used for DB access
"""
if not haveExcel:
return
if wrapper is None:
wrapper = ExcelWrapper.ExcelWrapper()
wrapper.Visible = 1
if len(where) and where.strip().find('where') != 0:
where = 'where %s'%(where)
if len(join) and join.strip().find('join') != 0:
join = 'join %s'%(join)
sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where)
cn = DbModule.connect(dBase,user,password)
c = cn.cursor()
try:
c.execute(sqlCommand)
except:
print 'problems executing SQL statement %s'%(repr(sqlCommand))
import sys,traceback
traceback.print_exc()
return
headers = []
colsToTake = []
strCols = []
# the description field of the cursor carries around info about the columns
# of the table
for i in range(len(c.description)):
item = c.description[i]
if item[1] not in DbInfo.sqlBinTypes:
colsToTake.append(i)
headers.append(item[0])
if item[1] in DbInfo.sqlTextTypes:
strCols.append(len(colsToTake)-1)
wrapper.Workbooks.Add()
r = wrapper.GetRange(1,1,1,len(headers))
# add the headers
r.Value = headers
# and make them bold
r.Font.Bold = 1
# now just insert the data... easy as pie
results = c.fetchall()
row = 2
fullData = []
for res in results:
vs = [res[x] for x in colsToTake]
for col in strCols:
vs[col] = "'%s"%vs[col]
if not lowMemory:
fullData.append(vs)
else:
wrapper[row,1:len(headers)]=vs
row+=1
if not lowMemory:
wrapper[row:row+len(fullData),1:len(headers)] = fullData
def DatabaseToText(dBase,table,fields='*',join='',where='',wrapper=None,
user='sysdba',password='masterkey',delim=','):
""" Pulls the contents of a database and makes a deliminted text file from them
**Arguments**
- dBase: the name of the DB file to be used
- table: the name of the table to query
- fields: the fields to select with the SQL query
- join: the join clause of the SQL query
(e.g. 'join foo on foo.bar=base.bar')
- where: the where clause of the SQL query
(e.g. 'where foo = 2' or 'where bar > 17.6')
- wrapper: an _Excel.ExcelWrapper.ExcelWrapper_ to be used
in interacting with Excel
- user: the username for DB access
- password: the password to be used for DB access
**Returns**
- the CSV data (as text)
"""
if len(where) and where.strip().find('where')==-1:
where = 'where %s'%(where)
if len(join) and join.strip().find('join') == -1:
join = 'join %s'%(join)
sqlCommand = 'select %s from %s %s %s'%(fields,table,join,where)
cn = DbModule.connect(dBase,user,password)
c = cn.cursor()
c.execute(sqlCommand)
headers = []
colsToTake = []
# the description field of the cursor carries around info about the columns
# of the table
for i in range(len(c.description)):
item = c.description[i]
if item[1] not in DbInfo.sqlBinTypes:
colsToTake.append(i)
headers.append(item[0])
lines = []
lines.append(delim.join(headers))
# grab the data
results = c.fetchall()
for res in results:
d = _take(res,colsToTake)
lines.append(delim.join(map(str,d)))
return '\n'.join(lines)
def TypeFinder(data,nRows,nCols,nullMarker=None):
"""
finds the types of the columns in _data_
if nullMarker is not None, elements of the data table which are
equal to nullMarker will not count towards setting the type of
their columns.
"""
priorities={types.FloatType:3,types.IntType:2,types.StringType:1,-1:-1}
res = [None]*nCols
for col in xrange(nCols):
typeHere = [-1,1]
for row in xrange(nRows):
d = data[row][col]
if d is not None:
locType = type(d)
if locType != types.FloatType and locType != types.IntType:
locType = types.StringType
try:
d = str(d)
except UnicodeError,msg:
print 'cannot convert text from row %d col %d to a string'%(row+2,col)
print '\t>%s'%(repr(d))
raise UnicodeError,msg
else:
typeHere[1] = max(typeHere[1],len(str(d)))
if locType == types.StringType:
if nullMarker is None or d != nullMarker:
l = max(len(d),typeHere[1])
typeHere = [types.StringType,l]
else:
try:
fD = float(int(d))
except OverflowError:
locType = types.FloatType
else:
if fD == d:
locType = types.IntType
if typeHere[0]!=types.StringType and \
priorities[locType] > priorities[typeHere[0]]:
typeHere[0] = locType
res[col] = typeHere
return res
def DetermineColTypes(wrapper,nullMarker=None):
"""This is kind of crude hack to automagically determine the types
of columns in the active Excel sheet
**Arguments**
- wrapper: the _ExcelWrapper_ to be used in interacting with Excel
- nullMarker: (optional) if this is not None, elements of the
data table which are equal to nullMarker will not count towards
setting the type of their columns.
**Returns**
- a list of the types of each column
**Note**
- we make the assumption that there are only three possible types: int,
float and string.
"""
nCols = wrapper.FindLastCol(1,1)
nRows = wrapper.FindLastRow(1,1)
dList = wrapper[2:nRows,1:nCols]
res = TypeFinder(dList,nRows-1,nCols,nullMarker=nullMarker)
return res
def _AdjustColHeadings(colHeadings,maxColLabelLen):
""" *For Internal Use*
removes illegal characters from column headings
and truncates those which are too long.
"""
for i in xrange(len(colHeadings)):
# replace unallowed characters and strip extra white space
colHeadings[i] = string.strip(colHeadings[i])
colHeadings[i] = string.replace(colHeadings[i],' ','_')
colHeadings[i] = string.replace(colHeadings[i],'-','_')
if len(colHeadings[i]) > maxColLabelLen:
# interbase (at least) has a limit on the maximum length of a column name
newHead = string.replace(colHeadings[i],'_','')
newHead = newHead[:maxColLabelLen]
print '\tHeading %s too long, changed to %s'%(colHeadings[i],newHead)
colHeadings[i] = newHead
return colHeadings
def GetTypeStrings(colHeadings,colTypes,keyCol=None):
""" returns a list of SQL type strings
"""
typeStrs=[]
for i in xrange(len(colTypes)):
type = colTypes[i]
if type[0] == types.FloatType:
typeStrs.append('%s double precision'%colHeadings[i])
elif type[0] == types.IntType:
typeStrs.append('%s integer'%colHeadings[i])
else:
typeStrs.append('%s varchar(%d)'%(colHeadings[i],type[1]))
if colHeadings[i] == keyCol:
typeStrs[-1] = '%s not null primary key'%(typeStrs[-1])
return typeStrs
def _insertBlock(conn,sqlStr,block,silent=False):
try:
conn.cursor().executemany(sqlStr,block)
except:
res = 0
conn.commit()
for row in block:
try:
conn.cursor().execute(sqlStr,tuple(row))
res += 1
except:
if not silent:
import traceback
traceback.print_exc()
print 'insert failed:',sqlStr
print '\t',repr(row)
else:
conn.commit()
else:
res = len(block)
return res
def _AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,
nullMarker=None,blockSize=100):
""" *For Internal Use*
(drops and) creates a table and then inserts the values
"""
cn = DbModule.connect(dBase,user,password)
c = cn.cursor()
try:
c.execute('drop table %s'%(table))
except:
print 'cannot drop table %s'%(table)
try:
sqlStr = 'create table %s (%s)'%(table,colDefs)
c.execute(sqlStr)
except:
print 'create table failed: ', sqlStr
print 'here is the exception:'
import traceback
traceback.print_exc()
return
cn.commit()
c = None
block = []
entryTxt = [DbModule.placeHolder]*len(data[0])
dStr = ','.join(entryTxt)
sqlStr = 'insert into %s values (%s)'%(table,dStr)
nDone = 0
for row in data:
entries = [None]*len(row)
for col in xrange(len(row)):
if row[col] is not None and \
(nullMarker is None or row[col] != nullMarker):
if colTypes[col][0] == types.FloatType:
entries[col] = float(row[col])
elif colTypes[col][0] == types.IntType:
entries[col] = int(row[col])
else:
entries[col] = str(row[col])
else:
entries[col] = None
block.append(tuple(entries))
if len(block)>=blockSize:
nDone += _insertBlock(cn,sqlStr,block)
if not hasattr(cn,'autocommit') or not cn.autocommit:
cn.commit()
block = []
if len(block):
nDone += _insertBlock(cn,sqlStr,block)
if not hasattr(cn,'autocommit') or not cn.autocommit:
cn.commit()
def ExcelToDatabase(dBase,table,wrapper=None,user='sysdba',password='masterkey',
maxColLabelLen=31,keyCol=None,nullMarker=None,force=0):
"""convert the active excel worksheet into a database.
this isn't as smooth or slick as the conversion the other way... sad.
**Arguments**
- dBase: the name of the DB to use
- table: the name of the table to create/overwrite
- wrapper: the _ExcelWrapper_ to use
- user: the user name to use in connecting to the DB
- password: the password to use in connecting to the DB
- maxColLabelLen: the maximum length a column label should be
allowed to have (truncation otherwise)
- keyCol: the column to be used as an index for the db
**Notes**
- if _table_ already exists, it is destroyed before we write
the new data
"""
if not haveExcel:
return
table.replace('-','_')
table.replace(' ','_')
if not force:
tblNames = [x.strip() for x in DbInfo.GetTableNames(dBase)]
tmp = table.upper()
if tmp in tblNames:
resp = raw_input('Table %s already exists, overwrite it? '%(table))
if not resp or resp[0] not in ['Y','y']:
print 'cancelled'
return
if wrapper is None:
wrapper = ExcelWrapper.ExcelWrapper()
colHeadings = wrapper.GetHeadings()
_AdjustColHeadings(colHeadings,maxColLabelLen)
nCols = len(colHeadings)
nRows = wrapper.FindLastRow(1,1)
data = wrapper[2:nRows,1:nCols]
# determine the types of each column
colTypes = TypeFinder(data,nRows-1,nCols,nullMarker=nullMarker)
typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol)
colDefs=','.join(typeStrs)
_AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,nullMarker=nullMarker)
def TextFileToDatabase(dBase,table,inF,delim=',',
user='sysdba',password='masterkey',
maxColLabelLen=31,keyCol=None,nullMarker=None):
"""loads the contents of the text file into a database.
**Arguments**
- dBase: the name of the DB to use
- table: the name of the table to create/overwrite
- inF: the file like object from which the data should
be pulled (must support readline())
- delim: the delimiter used to separate fields
- user: the user name to use in connecting to the DB
- password: the password to use in connecting to the DB
- maxColLabelLen: the maximum length a column label should be
allowed to have (truncation otherwise)
- keyCol: the column to be used as an index for the db
**Notes**
- if _table_ already exists, it is destroyed before we write
the new data
- we assume that the first row of the file contains the column names
"""
table.replace('-','_')
table.replace(' ','_')
colHeadings = inF.readline().split(delim)
_AdjustColHeadings(colHeadings,maxColLabelLen)
nCols = len(colHeadings)
data = []
inL = inF.readline()
while inL:
inL = inL.replace('\r','')
inL = inL.replace('\n','')
splitL = inL.split(delim)
if len(splitL)!=nCols:
print '>>>',repr(inL)
assert len(splitL)==nCols,'unequal length'
tmpVect = []
for entry in splitL:
try:
val = int(entry)
except:
try:
val = float(entry)
except:
val = entry
tmpVect.append(val)
data.append(tmpVect)
inL = inF.readline()
nRows = len(data)
# determine the types of each column
colTypes = TypeFinder(data,nRows,nCols,nullMarker=nullMarker)
typeStrs = GetTypeStrings(colHeadings,colTypes,keyCol=keyCol)
colDefs=','.join(typeStrs)
_AddDataToDb(dBase,table,user,password,colDefs,colTypes,data,
nullMarker=nullMarker)
def DatabaseToDatabase(fromDb,fromTbl,toDb,toTbl,
fields='*',join='',where='',
user='sysdba',password='masterkey',keyCol=None,nullMarker='None'):
"""
FIX: at the moment this is a hack
"""
import cStringIO
io = cStringIO.StringIO()
io.write(DatabaseToText(fromDb,fromTbl,fields=fields,join=join,where=where,
user=user,password=password))
io.seek(-1)
TextFileToDatabase(toDb,toTbl,io,user=user,password=password,keyCol=keyCol,
nullMarker=nullMarker)
if __name__=='__main__':
import cStringIO
io = cStringIO.StringIO()
io.write('foo,bar,baz\n')
io.write('1,2,3\n')
io.write('1.1,4,5\n')
io.write('4,foo,6\n')
io.seek(0)
import RDConfig,os
dirLoc = os.path.join(RDConfig.RDCodeDir,'Dbase','TEST.GDB')
TextFileToDatabase(dirLoc,'fromtext',io)