mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
180 lines
6.0 KiB
C++
180 lines
6.0 KiB
C++
//
// Copyright (c) 2015 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
// Implementation details here are taken from the file fpb_io.py from chemfp
// (www.chemfp.org)
// Many thanks to Andrew Dalke for creating such great software and for
// helping explain the FPB implementation
|
|
|
|
#include <DataStructs/ExplicitBitVect.h>
|
|
#include <RDGeneral/Invariant.h>
|
|
#include <RDGeneral/StreamOps.h>
|
|
#include "FPBReader.h"
|
|
#include <boost/cstdint.hpp>
|
|
|
|
namespace RDKit {
|
|
|
|
namespace detail {
|
|
// Number of bytes in the signature that FPBReader::init() reads from the start
// of the stream.
const unsigned int magicSize = 8;
// The expected signature. The explicit length is required because the literal
// ends in two NUL bytes that a plain C-string constructor would drop.
const std::string FPB_MAGIC("FPB1\r\n\0\0", magicSize);
// Number of bytes in the tag name that follows each chunk's size field.
const unsigned int tagNameSize = 4;
|
|
|
|
// the caller is responsible for calling delete[] on `data`
|
|
void readChunk(std::istream &istrm, std::string &nm, boost::uint64_t &sz,
|
|
boost::uint8_t *&data) {
|
|
streamRead(istrm, sz);
|
|
char tag[tagNameSize + 1];
|
|
tag[tagNameSize] = 0;
|
|
istrm.read(tag, tagNameSize);
|
|
nm = tag;
|
|
if (sz) {
|
|
data = new boost::uint8_t[sz];
|
|
istrm.read((char *)data, sz);
|
|
} else {
|
|
data = NULL;
|
|
}
|
|
std::cerr << " CHUNKSZ: " << sz << " name: " << nm << std::endl;
|
|
}
|
|
|
|
struct FPBReader_impl {
|
|
unsigned int len;
|
|
unsigned int nBits;
|
|
boost::uint32_t numBytesStoredPerFingerprint;
|
|
// we're assuming that nothing practical has more than 65K bits set
|
|
std::vector<boost::uint16_t> popCounts;
|
|
const boost::uint8_t *dp_fpData; // do not free this
|
|
const boost::uint8_t *dp_arenaChunk; // this is what should be freed
|
|
};
|
|
|
|
// Sanity-checks the size of a POPC chunk. Only validation is done so far:
// the contents of `chunk` are not read and nothing is stored in `dp_impl`.
//
// Throws ValueErrorException if `sz` is not a multiple of 4 or if the chunk
// holds fewer than 9 four-byte entries.
void extractPopCounts(FPBReader_impl *dp_impl, boost::uint64_t sz,
                      const boost::uint8_t *chunk) {
  PRECONDITION(dp_impl, "bad pointer");

  const bool wholeEntriesOnly = (sz % 4) == 0;
  if (!wholeEntriesOnly) {
    throw ValueErrorException("POPC chunk size must be a multiple of 4 bytes");
  }

  const unsigned int nEntries = sz / 4;
  if (!(nEntries >= 9)) {
    throw ValueErrorException("POPC must contain at least 9 offsets");
  }

  // FIX: Finish this;
}
|
|
// Parses the header of an AREN chunk and records in `dp_impl` where the
// fingerprints start, how many there are, and how large each one is.
//
//   dp_impl : receives nBits, numBytesStoredPerFingerprint, dp_fpData and len
//   sz      : size of the chunk in bytes
//   chunk   : start of the chunk; dp_impl->dp_fpData will point into this
//             buffer, so it must outlive dp_impl's use of it
//
// Throws ValueErrorException if the chunk is too small for its own header and
// spacer, or if the declared sizes are inconsistent.
void extractArena(FPBReader_impl *dp_impl, boost::uint64_t sz,
                  const boost::uint8_t *chunk) {
  PRECONDITION(dp_impl, "bad pointer");
  /* Documentation from Andrew's code on the structure of the arena:
     The 'AREN'a starts with a header:
       <num_bytes: 4 bytes>  -- the number of bytes in a fingerprint
       <storage_size: 4 bytes>  -- number of bytes in fingerprint + extra bytes
       <spacer_size: 1 byte>  -- the number of spacer bytes used so the
                                 fingerprint chunk starts on an aligned file
                                 position.
       <spacer: $spacer_size NUL bytes>  -- up to 255 NUL bytes, used for
                                            alignment.
     The fingerprints are N fingerprint fields, ordered sequentially.
       <fp0: $storage_size bytes>  -- the first fingerprint
       <fp1: $storage_size bytes>  -- the second fingerprint
       ...
     The last fingerprint ends at the last byte of the arena chunk.

     Each fingerprint contains:
       <fingerprint: $num_bytes bytes>  -- the actual fingerprint data
       <extra: $storage_size-$num_bytes bytes>  -- the 'extra' NULL padding
                 bytes used so storage_size is a multiple of the alignment.

     To get the number of fingerprints in the arena:
       (len(arena content) - 4 - 4 - 1 - $spacer_size) // $storage_size
  */

  // 4 (num_bytes) + 4 (storage_size) + 1 (spacer_size)
  const boost::uint64_t headerSize = 2 * sizeof(boost::uint32_t) + 1;
  // An empty chunk arrives as a NULL pointer; a short one cannot hold the
  // header. Either way the reads below would be out of bounds.
  if (!chunk || sz < headerSize) {
    throw ValueErrorException("AREN chunk is too small to contain a header");
  }

  // The header fields are read in host byte order.
  const boost::uint32_t numBytesPerFingerprint =
      *reinterpret_cast<const boost::uint32_t *>(chunk);
  chunk += sizeof(boost::uint32_t);
  const boost::uint32_t numBytesStored =
      *reinterpret_cast<const boost::uint32_t *>(chunk);
  chunk += sizeof(boost::uint32_t);
  const boost::uint8_t spacer = *chunk;
  chunk += 1;

  // Without this check `sz - 9 - spacer` wraps around (unsigned) and
  // dp_fpData ends up past the end of the buffer.
  if (sz - headerSize < spacer) {
    throw ValueErrorException("AREN spacer extends past the end of the chunk");
  }
  // The storage size is the divisor used to compute the fingerprint count.
  if (!numBytesStored) {
    throw ValueErrorException("AREN storage size must not be zero");
  }
  // Each stored record has to be large enough to hold a whole fingerprint.
  if (numBytesStored < numBytesPerFingerprint) {
    throw ValueErrorException(
        "AREN storage size is smaller than the fingerprint size");
  }

  // now move forward the length of the spacer
  chunk += spacer;

  dp_impl->nBits = numBytesPerFingerprint * 8;
  dp_impl->numBytesStoredPerFingerprint = numBytesStored;
  dp_impl->dp_fpData = chunk;
  dp_impl->len = (sz - headerSize - spacer) / numBytesStored;
}
|
|
|
|
// the caller is responsible for delete'ing this
|
|
ExplicitBitVect *extractFP(const FPBReader_impl *dp_impl, unsigned int which) {
|
|
PRECONDITION(dp_impl, "bad reader pointer");
|
|
PRECONDITION(dp_impl->dp_fpData, "bad fpdata pointer");
|
|
|
|
if (which > dp_impl->len) {
|
|
throw ValueErrorException("bad index");
|
|
}
|
|
const boost::uint8_t *fpData =
|
|
dp_impl->dp_fpData + which * dp_impl->numBytesStoredPerFingerprint;
|
|
boost::dynamic_bitset<boost::uint8_t> *fpbs =
|
|
new boost::dynamic_bitset<boost::uint8_t>(fpData,
|
|
fpData + dp_impl->nBits / 8);
|
|
return new ExplicitBitVect((boost::dynamic_bitset<> *)fpbs);
|
|
};
|
|
} // end of detail namespace
|
|
|
|
void FPBReader::init() {
|
|
PRECONDITION(dp_istrm, "no stream");
|
|
dp_impl = new detail::FPBReader_impl;
|
|
|
|
char magic[detail::magicSize];
|
|
dp_istrm->read(magic, detail::magicSize);
|
|
if (detail::FPB_MAGIC != std::string(magic, detail::magicSize)) {
|
|
throw BadFileException("Invalid FPB magic");
|
|
}
|
|
while (1) {
|
|
if (dp_istrm->eof()) throw BadFileException("EOF hit before FEND record");
|
|
std::string chunkNm;
|
|
boost::uint64_t chunkSz;
|
|
boost::uint8_t *chunk = NULL;
|
|
detail::readChunk(*dp_istrm, chunkNm, chunkSz, chunk);
|
|
if (chunkNm == "FEND") {
|
|
break;
|
|
} else if (chunkNm == "POPC") {
|
|
detail::extractPopCounts(dp_impl, chunkSz, chunk);
|
|
} else if (chunkNm == "AREN") {
|
|
dp_impl->dp_arenaChunk = chunk;
|
|
detail::extractArena(dp_impl, chunkSz, chunk);
|
|
chunk = NULL;
|
|
} else if (chunkNm == "FPID") {
|
|
}
|
|
delete[] chunk;
|
|
}
|
|
df_init = true;
|
|
};
|
|
|
|
void FPBReader::destroy() {
|
|
if (dp_impl) delete[] dp_impl->dp_arenaChunk;
|
|
delete dp_impl;
|
|
};
|
|
|
|
// Returns a newly allocated bit vector for fingerprint `idx`; the caller is
// responsible for deleting it. The reader must have been initialized.
ExplicitBitVect *FPBReader::getFP(unsigned int idx) const {
  PRECONDITION(df_init, "not initialized");
  PRECONDITION(dp_impl, "no impl");
  URANGE_CHECK(idx, dp_impl->len);

  return detail::extractFP(dp_impl, idx);
}
|
|
|
|
// Returns the identifier of fingerprint `idx`.
// STUB: identifiers are not read from the file yet, so the same placeholder
// value is returned for every index that passes the range check.
std::string FPBReader::getId(unsigned int idx) const {
  PRECONDITION(df_init, "not initialized");
  PRECONDITION(dp_impl, "no impl");
  URANGE_CHECK(idx, dp_impl->len);

  return std::string("ZINC00902219");
}
|
|
// Returns the number of fingerprints found in the arena. The reader must
// have been initialized.
unsigned int FPBReader::length() const {
  PRECONDITION(df_init, "not initialized");
  PRECONDITION(dp_impl, "no impl");

  const unsigned int nFingerprints = dp_impl->len;
  return nFingerprints;
}
|
|
} // end of RDKit namespace
|