Merge simple AND queries onto atoms. (#8830)

* duplicate parser code

* regenerate smarts.tab files

* update test

* add release note

* restore pregenerated header

* Suggested changes (#23)

* add a generic flags interface to atoms and bonds

* suggested changes

---------

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
This commit is contained in:
Ricardo Rodriguez
2025-10-09 04:38:42 +02:00
committed by greg landrum
parent cd42cf18ea
commit cf64184339
10 changed files with 451 additions and 283 deletions

View File

@@ -216,6 +216,7 @@ void Atom::initFromOther(const Atom &other) {
} else {
dp_monomerInfo = nullptr;
}
d_flags = other.d_flags;
}
Atom::Atom(const Atom &other) : RDProps() { initFromOther(other); }

View File

@@ -382,6 +382,14 @@ class RDKIT_GRAPHMOL_EXPORT Atom : public RDProps {
return mapno;
}
//! Flags that can be used by to store information on atoms.
//! These are not serialized and should be treated as temporary values.
//! No guarantees are made about preserving these flags across library
//! calls.
void setFlags(std::uint64_t flags) { d_flags = flags; }
std::uint64_t getFlags() const { return d_flags; }
std::uint64_t &getFlags() { return d_flags; }
protected:
//! sets our owning molecule
void setOwningMol(ROMol *other);
@@ -403,6 +411,7 @@ class RDKIT_GRAPHMOL_EXPORT Atom : public RDProps {
std::uint16_t d_isotope;
atomindex_t d_index;
std::uint64_t d_flags = 0ul;
ROMol *dp_mol;
AtomMonomerInfo *dp_monomerInfo;

View File

@@ -38,6 +38,7 @@ Bond::Bond(const Bond &other) : RDProps(other) {
df_isAromatic = other.df_isAromatic;
df_isConjugated = other.df_isConjugated;
d_index = other.d_index;
d_flags = other.d_flags;
}
Bond::~Bond() { delete dp_stereoAtoms; }
@@ -61,6 +62,7 @@ Bond &Bond::operator=(const Bond &other) {
df_isConjugated = other.df_isConjugated;
d_index = other.d_index;
d_props = other.d_props;
d_flags = other.d_flags;
return *this;
}

View File

@@ -124,6 +124,7 @@ class RDKIT_GRAPHMOL_EXPORT Bond : public RDProps {
// the molecule will still be pointing to the original object
dp_mol = std::exchange(o.dp_mol, nullptr);
dp_stereoAtoms = std::exchange(o.dp_stereoAtoms, nullptr);
d_flags = std::exchange(o.d_flags, 0);
}
Bond &operator=(Bond &&o) noexcept {
if (this == &o) {
@@ -143,6 +144,7 @@ class RDKIT_GRAPHMOL_EXPORT Bond : public RDProps {
delete dp_stereoAtoms;
dp_mol = std::exchange(o.dp_mol, nullptr);
dp_stereoAtoms = std::exchange(o.dp_stereoAtoms, nullptr);
d_flags = std::exchange(o.d_flags, 0);
return *this;
}
@@ -367,6 +369,14 @@ class RDKIT_GRAPHMOL_EXPORT Bond : public RDProps {
*/
void updatePropertyCache(bool strict = true) { (void)strict; }
//! Flags that can be used by to store information on bonds.
//! These are not serialized and should be treated as temporary values.
//! No guarantees are made about preserving these flags across library
//! calls.
void setFlags(std::uint64_t flags) { d_flags = flags; }
std::uint64_t getFlags() const { return d_flags; }
std::uint64_t &getFlags() { return d_flags; }
protected:
//! sets our owning molecule
/// void setOwningMol(ROMol *other);
@@ -381,6 +391,7 @@ class RDKIT_GRAPHMOL_EXPORT Bond : public RDProps {
std::uint8_t d_bondType;
std::uint8_t d_dirTag;
std::uint8_t d_stereo;
std::uint64_t d_flags = 0;
void initBond();
};

File diff suppressed because it is too large Load Diff

View File

@@ -35,8 +35,8 @@
especially those whose name start with YY_ or yy_. They are
private implementation details that can be changed or removed. */
#ifndef YY_YYSMARTS_USR_APP_RDKIT_CODE_GRAPHMOL_SMILESPARSE_SMARTS_TAB_HPP_INCLUDED
# define YY_YYSMARTS_USR_APP_RDKIT_CODE_GRAPHMOL_SMILESPARSE_SMARTS_TAB_HPP_INCLUDED
#ifndef YY_YYSMARTS_SCRATCH_RDKIT_GIT_CODE_GRAPHMOL_SMILESPARSE_SMARTS_TAB_HPP_INCLUDED
# define YY_YYSMARTS_SCRATCH_RDKIT_GIT_CODE_GRAPHMOL_SMILESPARSE_SMARTS_TAB_HPP_INCLUDED
/* Debug traces. */
#ifndef YYDEBUG
# define YYDEBUG 0
@@ -131,4 +131,4 @@ int yysmarts_parse (const char *input, std::vector<RDKit::RWMol *> *molList, RDK
#endif
#endif /* !YY_YYSMARTS_USR_APP_RDKIT_CODE_GRAPHMOL_SMILESPARSE_SMARTS_TAB_HPP_INCLUDED */
#endif /* !YY_YYSMARTS_SCRATCH_RDKIT_GIT_CODE_GRAPHMOL_SMILESPARSE_SMARTS_TAB_HPP_INCLUDED */

View File

@@ -1,7 +1,7 @@
%{
//
// Copyright (C) 2003-2022 Greg Landrum and other RDKit contributors
// Copyright (C) 2003-2025 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
//
@@ -33,6 +33,36 @@ namespace {
molList->clear();
molList->resize(0);
}
const std::uint64_t SMARTS_H_MASK = 0x1;
const std::uint64_t SMARTS_CHARGE_MASK = 0x2;
void atom_expr_and_point_query(QueryAtom *atom_expr, QueryAtom *point_query) {
atom_expr->expandQuery(point_query->getQuery()->copy(), Queries::COMPOSITE_AND, true);
if (atom_expr->getChiralTag() == Atom::CHI_UNSPECIFIED) {
atom_expr->setChiralTag(point_query->getChiralTag());
}
if (point_query->getFlags() & SMARTS_H_MASK) {
if (!(atom_expr->getFlags() & SMARTS_H_MASK)) {
atom_expr->setNumExplicitHs(point_query->getNumExplicitHs());
atom_expr->setNoImplicit(true);
atom_expr->getFlags() |= SMARTS_H_MASK;
} else if (atom_expr->getNumExplicitHs() != point_query->getNumExplicitHs()) {
// conflicting queries...
atom_expr->setNumExplicitHs(0);
atom_expr->setNoImplicit(true);
}
}
if (point_query->getFlags() & SMARTS_CHARGE_MASK) {
if (!(atom_expr->getFlags() & SMARTS_CHARGE_MASK)) {
atom_expr->setFormalCharge(point_query->getFormalCharge());
atom_expr->getFlags() |= SMARTS_CHARGE_MASK;
} else if (atom_expr->getFormalCharge() != point_query->getFormalCharge()) {
// conflicting queries...
atom_expr->setFormalCharge(0);
}
}
}
}
void
yysmarts_error( const char *input,
@@ -415,12 +445,14 @@ hydrogen_atom: ATOM_OPEN_TOKEN H_TOKEN ATOM_CLOSE_TOKEN
| ATOM_OPEN_TOKEN H_TOKEN charge_spec ATOM_CLOSE_TOKEN {
QueryAtom *newQ = new QueryAtom(1);
newQ->setFormalCharge($3);
newQ->getFlags() |= SMARTS_CHARGE_MASK;
newQ->expandQuery(makeAtomFormalChargeQuery($3),Queries::COMPOSITE_AND,true);
$$=newQ;
}
| ATOM_OPEN_TOKEN H_TOKEN charge_spec COLON_TOKEN number ATOM_CLOSE_TOKEN {
QueryAtom *newQ = new QueryAtom(1);
newQ->setFormalCharge($3);
newQ->getFlags() |= SMARTS_CHARGE_MASK;
newQ->expandQuery(makeAtomFormalChargeQuery($3),Queries::COMPOSITE_AND,true);
newQ->setProp(RDKit::common_properties::molAtomMapNumber,$5);
@@ -430,6 +462,7 @@ hydrogen_atom: ATOM_OPEN_TOKEN H_TOKEN ATOM_CLOSE_TOKEN
QueryAtom *newQ = new QueryAtom(1);
newQ->setIsotope($2);
newQ->setFormalCharge($4);
newQ->getFlags() |= SMARTS_CHARGE_MASK;
newQ->expandQuery(makeAtomIsotopeQuery($2),Queries::COMPOSITE_AND,true);
newQ->expandQuery(makeAtomFormalChargeQuery($4),Queries::COMPOSITE_AND,true);
$$=newQ;
@@ -438,6 +471,7 @@ hydrogen_atom: ATOM_OPEN_TOKEN H_TOKEN ATOM_CLOSE_TOKEN
QueryAtom *newQ = new QueryAtom(1);
newQ->setIsotope($2);
newQ->setFormalCharge($4);
newQ->getFlags() |= SMARTS_CHARGE_MASK;
newQ->expandQuery(makeAtomIsotopeQuery($2),Queries::COMPOSITE_AND,true);
newQ->expandQuery(makeAtomFormalChargeQuery($4),Queries::COMPOSITE_AND,true);
newQ->setProp(RDKit::common_properties::molAtomMapNumber,$6);
@@ -452,6 +486,7 @@ atom_expr: atom_expr AND_TOKEN atom_expr {
if($1->getChiralTag()==Atom::CHI_UNSPECIFIED) $1->setChiralTag($3->getChiralTag());
SmilesParseOps::ClearAtomChemicalProps($1);
delete $3;
$$ = $1;
}
| atom_expr OR_TOKEN atom_expr {
$1->expandQuery($3->getQuery()->copy(),Queries::COMPOSITE_OR,true);
@@ -459,35 +494,24 @@ atom_expr: atom_expr AND_TOKEN atom_expr {
SmilesParseOps::ClearAtomChemicalProps($1);
$1->setAtomicNum(0);
delete $3;
$$ = $1;
}
| atom_expr SEMI_TOKEN atom_expr {
$1->expandQuery($3->getQuery()->copy(),Queries::COMPOSITE_AND,true);
if($1->getChiralTag()==Atom::CHI_UNSPECIFIED) $1->setChiralTag($3->getChiralTag());
SmilesParseOps::ClearAtomChemicalProps($1);
delete $3;
$$ = $1;
}
| atom_expr point_query {
$1->expandQuery($2->getQuery()->copy(),Queries::COMPOSITE_AND,true);
if($1->getChiralTag()==Atom::CHI_UNSPECIFIED) $1->setChiralTag($2->getChiralTag());
if($2->getNumExplicitHs()){
if(!$1->getNumExplicitHs()){
$1->setNumExplicitHs($2->getNumExplicitHs());
$1->setNoImplicit(true);
} else if($1->getNumExplicitHs()!=$2->getNumExplicitHs()){
// conflicting queries...
$1->setNumExplicitHs(0);
$1->setNoImplicit(false);
}
}
if($2->getFormalCharge()){
if(!$1->getFormalCharge()){
$1->setFormalCharge($2->getFormalCharge());
} else if($1->getFormalCharge()!=$2->getFormalCharge()){
// conflicting queries...
$1->setFormalCharge(0);
}
}
atom_expr_and_point_query($1, $2);
delete $2;
$$ = $1;
}
| atom_expr AND_TOKEN point_query {
atom_expr_and_point_query($1, $3);
delete $3;
$$ = $1;
}
| point_query
;
@@ -570,33 +594,41 @@ atom_query: simple_atom
| IMPLICIT_H_ATOM_QUERY_TOKEN
| COMPLEX_ATOM_QUERY_TOKEN number {
static_cast<ATOM_EQUALS_QUERY *>($1->getQuery())->setVal($2);
$$ = $1;
}
| HETERONEIGHBOR_ATOM_QUERY_TOKEN number {
$1->setQuery(makeAtomNumHeteroatomNbrsQuery($2));
$$ = $1;
}
| ALIPHATICHETERONEIGHBOR_ATOM_QUERY_TOKEN number {
$1->setQuery(makeAtomNumAliphaticHeteroatomNbrsQuery($2));
$$ = $1;
}
| RINGSIZE_ATOM_QUERY_TOKEN number {
$1->setQuery(makeAtomMinRingSizeQuery($2));
$$ = $1;
}
| RINGBOND_ATOM_QUERY_TOKEN number {
$1->setQuery(makeAtomRingBondCountQuery($2));
$$ = $1;
}
| IMPLICIT_H_ATOM_QUERY_TOKEN number {
$1->setQuery(makeAtomImplicitHCountQuery($2));
$$ = $1;
}
| possible_range_query RANGE_OPEN_TOKEN MINUS_TOKEN number RANGE_CLOSE_TOKEN {
ATOM_EQUALS_QUERY *oq = static_cast<ATOM_EQUALS_QUERY *>($1->getQuery());
ATOM_GREATEREQUAL_QUERY *nq = makeAtomSimpleQuery<ATOM_GREATEREQUAL_QUERY>($4,oq->getDataFunc(),
std::string("greater_")+oq->getDescription());
$1->setQuery(nq);
$$ = $1;
}
| possible_range_query RANGE_OPEN_TOKEN number MINUS_TOKEN RANGE_CLOSE_TOKEN {
ATOM_EQUALS_QUERY *oq = static_cast<ATOM_EQUALS_QUERY *>($1->getQuery());
ATOM_LESSEQUAL_QUERY *nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>($3,oq->getDataFunc(),
std::string("less_")+oq->getDescription());
$1->setQuery(nq);
$$ = $1;
}
| possible_range_query RANGE_OPEN_TOKEN number MINUS_TOKEN number RANGE_CLOSE_TOKEN {
ATOM_EQUALS_QUERY *oq = static_cast<ATOM_EQUALS_QUERY *>($1->getQuery());
@@ -604,6 +636,7 @@ atom_query: simple_atom
oq->getDataFunc(),
std::string("range_")+oq->getDescription());
$1->setQuery(nq);
$$ = $1;
}
| number H_TOKEN {
QueryAtom *newQ = new QueryAtom();
@@ -611,6 +644,8 @@ atom_query: simple_atom
newQ->setIsotope($1);
newQ->expandQuery(makeAtomHCountQuery(1),Queries::COMPOSITE_AND,true);
newQ->setNumExplicitHs(1);
newQ->setNoImplicit(true);
newQ->getFlags() |= SMARTS_H_MASK;
$$=newQ;
}
| number H_TOKEN number {
@@ -619,24 +654,32 @@ atom_query: simple_atom
newQ->setIsotope($1);
newQ->expandQuery(makeAtomHCountQuery($3),Queries::COMPOSITE_AND,true);
newQ->setNumExplicitHs($3);
newQ->setNoImplicit(true);
newQ->getFlags() |= SMARTS_H_MASK;
$$=newQ;
}
| H_TOKEN number {
QueryAtom *newQ = new QueryAtom();
newQ->setQuery(makeAtomHCountQuery($2));
newQ->setNumExplicitHs($2);
newQ->setNoImplicit(true);
newQ->getFlags() |= SMARTS_H_MASK;
$$=newQ;
}
| H_TOKEN {
QueryAtom *newQ = new QueryAtom();
newQ->setQuery(makeAtomHCountQuery(1));
newQ->setNumExplicitHs(1);
newQ->setNoImplicit(true);
newQ->getFlags() |= SMARTS_H_MASK;
$$=newQ;
}
| charge_spec {
QueryAtom *newQ = new QueryAtom();
newQ->setQuery(makeAtomFormalChargeQuery($1));
newQ->setFormalCharge($1);
newQ->getFlags() |= SMARTS_CHARGE_MASK;
$$=newQ;
}
| AT_TOKEN AT_TOKEN {
@@ -676,18 +719,23 @@ atom_query: simple_atom
possible_range_query : COMPLEX_ATOM_QUERY_TOKEN
| HETERONEIGHBOR_ATOM_QUERY_TOKEN {
$1->setQuery(makeAtomNumHeteroatomNbrsQuery(0));
$$ = $1;
}
| ALIPHATICHETERONEIGHBOR_ATOM_QUERY_TOKEN {
$1->setQuery(makeAtomNumAliphaticHeteroatomNbrsQuery(0));
$$ = $1;
}
| RINGSIZE_ATOM_QUERY_TOKEN {
$1->setQuery(makeAtomMinRingSizeQuery(5)); // this is going to be ignored anyway
$$ = $1;
}
| RINGBOND_ATOM_QUERY_TOKEN {
$1->setQuery(makeAtomRingBondCountQuery(0));
$$ = $1;
}
| IMPLICIT_H_ATOM_QUERY_TOKEN {
$1->setQuery(makeAtomImplicitHCountQuery(0));
$$ = $1;
}
| PLUS_TOKEN {
QueryAtom *newQ = new QueryAtom();
@@ -727,14 +775,17 @@ simple_atom: ORGANIC_ATOM_TOKEN {
bond_expr:bond_expr AND_TOKEN bond_expr {
$1->expandQuery($3->getQuery()->copy(),Queries::COMPOSITE_AND,true);
delete $3;
$$ = $1;
}
| bond_expr OR_TOKEN bond_expr {
$1->expandQuery($3->getQuery()->copy(),Queries::COMPOSITE_OR,true);
delete $3;
$$ = $1;
}
| bond_expr SEMI_TOKEN bond_expr {
$1->expandQuery($3->getQuery()->copy(),Queries::COMPOSITE_AND,true);
delete $3;
$$ = $1;
}
| bond_query
;
@@ -743,6 +794,7 @@ bond_query: bondd
| bond_query bondd {
$1->expandQuery($2->getQuery()->copy(),Queries::COMPOSITE_AND,true);
delete $2;
$$ = $1;
}
;

View File

@@ -40,4 +40,29 @@ TEST_CASE("Github #8424: direction on aromatic bonds in SMARTS") {
CHECK(smarts.find("/") != std::string::npos);
CHECK(smarts.find("\\") != std::string::npos);
}
}
TEST_CASE("repeated explicit H counts and charges") {
SECTION("h counts") {
std::vector<std::string> smartses = {
"[N&H3&H0]",
"[N&H0&H3]",
};
for (const auto &smarts : smartses) {
INFO(smarts);
auto m = v2::SmilesParse::MolFromSmarts(smarts);
REQUIRE(m);
CHECK(m->getAtomWithIdx(0)->getNoImplicit());
CHECK(m->getAtomWithIdx(0)->getNumExplicitHs() == 0);
}
}
SECTION("charges") {
std::vector<std::string> smartses = {"[N&+&+0]", "[N&+0&+2]"};
for (const auto &smarts : smartses) {
INFO(smarts);
auto m = v2::SmilesParse::MolFromSmarts(smarts);
REQUIRE(m);
CHECK(m->getAtomWithIdx(0)->getFormalCharge() == 0);
}
}
}

View File

@@ -2338,7 +2338,7 @@ void testChargesAndIsotopes() {
{
std::unique_ptr<ROMol> p(
SmartsToMol("[12C][12#6][12C+][12C+1][C+][C+1][12][+][C][#6][12CH2]["
"12CH3+][CH4+][14N@H+]"));
"12CH3+][CH4+][14N@H+][C&+][12C&+]"));
TEST_ASSERT(p);
TEST_ASSERT(p->getAtomWithIdx(0)->getFormalCharge() == 0);
TEST_ASSERT(p->getAtomWithIdx(1)->getFormalCharge() == 0);
@@ -2354,6 +2354,8 @@ void testChargesAndIsotopes() {
TEST_ASSERT(p->getAtomWithIdx(11)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(12)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(13)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(14)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(15)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(0)->getIsotope() == 12);
TEST_ASSERT(p->getAtomWithIdx(1)->getIsotope() == 12);
@@ -2369,6 +2371,8 @@ void testChargesAndIsotopes() {
TEST_ASSERT(p->getAtomWithIdx(11)->getIsotope() == 12);
TEST_ASSERT(p->getAtomWithIdx(12)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(13)->getIsotope() == 14);
TEST_ASSERT(p->getAtomWithIdx(14)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(15)->getIsotope() == 12);
TEST_ASSERT(p->getAtomWithIdx(9)->getNumExplicitHs() == 0);
TEST_ASSERT(p->getAtomWithIdx(10)->getNumExplicitHs() == 2);
@@ -2397,6 +2401,8 @@ void testChargesAndIsotopes() {
TEST_ASSERT(p->getAtomWithIdx(11)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(12)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(13)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(14)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(15)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(0)->getIsotope() == 12);
TEST_ASSERT(p->getAtomWithIdx(1)->getIsotope() == 12);
@@ -2412,6 +2418,8 @@ void testChargesAndIsotopes() {
TEST_ASSERT(p->getAtomWithIdx(11)->getIsotope() == 12);
TEST_ASSERT(p->getAtomWithIdx(12)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(13)->getIsotope() == 14);
TEST_ASSERT(p->getAtomWithIdx(14)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(15)->getIsotope() == 12);
// p->debugMol(std::cerr);
TEST_ASSERT(p->getAtomWithIdx(9)->getNumExplicitHs() == 0);
@@ -2431,15 +2439,18 @@ void testChargesAndIsotopes() {
TEST_ASSERT(p->getAtomWithIdx(0)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(1)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(2)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(3)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(3)->getIsotope() == 12);
TEST_ASSERT(p->getAtomWithIdx(8)->getIsotope() == 0);
TEST_ASSERT(p->getAtomWithIdx(3)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(4)->getFormalCharge() == 0);
TEST_ASSERT(p->getAtomWithIdx(5)->getFormalCharge() == 0);
TEST_ASSERT(p->getAtomWithIdx(6)->getFormalCharge() == 0);
TEST_ASSERT(p->getAtomWithIdx(6)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(7)->getFormalCharge() == 0);
TEST_ASSERT(p->getAtomWithIdx(8)->getFormalCharge() == 1);
TEST_ASSERT(p->getAtomWithIdx(0)->getNumExplicitHs() == 0);
TEST_ASSERT(p->getAtomWithIdx(6)->getNumExplicitHs() == 1);
TEST_ASSERT(p->getAtomWithIdx(8)->getNumExplicitHs() == 1);
}
BOOST_LOG(rdInfoLog) << "done" << std::endl;

View File

@@ -9,6 +9,8 @@ GitHub)
## Highlights
## Backwards incompatible changes:
- Simple AND queries are now merged into atoms. E.g. `[C&+]` now produces the
the same result as `[C+]` when parsed as SMARTS.
## New Features and Enhancements: