RDKit
Open-source cheminformatics and machine learning.
RGroupDecompData.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2017 Novartis Institutes for BioMedical Research
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef RGROUP_DECOMP_DATA
11 #define RGROUP_DECOMP_DATA
12 
13 #include "RGroupCore.h"
14 #include "RGroupDecomp.h"
15 #include "RGroupMatch.h"
16 #include "RGroupScore.h"
17 #include <vector>
18 #include <map>
19 
20 namespace RDKit
21 {
23  // matches[mol_idx] == vector of potential matches
24  std::map<int, RCore> cores;
25  std::map<std::string, int> newCores; // new "cores" found along the way
28 
29  std::vector<std::vector<RGroupMatch>> matches;
30  std::set<int> labels;
31  std::vector<size_t> permutation;
32  std::map<int, std::vector<int>> userLabels;
33 
34  std::vector<int> processedRlabels;
35 
36  std::map<int, int> finalRlabelMapping;
37 
38  RGroupDecompData(const RWMol &inputCore,
40  : params(std::move(inputParams)) {
41  cores[0] = RCore(inputCore);
42  prepareCores();
43  }
44 
45  RGroupDecompData(const std::vector<ROMOL_SPTR> &inputCores,
47  : params(std::move(inputParams)) {
48  for (size_t i = 0; i < inputCores.size(); ++i) {
49  cores[i] = RCore(*inputCores[i]);
50  }
51 
52  prepareCores();
53  }
54 
55  void prepareCores() {
56  for (auto &core : cores) {
57  RWMol *alignCore = core.first ? cores[0].core.get() : nullptr;
58  CHECK_INVARIANT(params.prepareCore(*core.second.core, alignCore),
59  "Could not prepare at least one core");
61  core.second.findIndicesWithRLabel();
62  }
63  core.second.labelledCore.reset(new RWMol(*core.second.core));
64  }
65  }
66 
67  void setRlabel(Atom *atom, int rlabel) {
68  PRECONDITION(rlabel != 0, "RLabels must be >0");
70  atom->setAtomMapNum(rlabel);
71  }
72 
74  std::string dLabel = "R" + std::to_string(rlabel);
76  setAtomRLabel(atom, rlabel);
77  }
78 
80  atom->setIsotope(rlabel + 1);
81  }
82  }
83 
84  void prune() { // prune all but the current "best" permutation of matches
85  for (size_t mol_idx = 0; mol_idx < permutation.size(); ++mol_idx) {
86  std::vector<RGroupMatch> keepVector;
87  keepVector.push_back(matches[mol_idx][permutation[mol_idx]]);
88  matches[mol_idx] = keepVector;
89  }
90  permutation = std::vector<size_t>(matches.size(), 0);
91  }
92 
93  // Return the RGroups with the current "best" permutation
94  // of matches.
95  std::vector<RGroupMatch> GetCurrentBestPermutation() const {
96  const bool removeAllHydrogenRGroups = params.removeAllHydrogenRGroups;
97 
98  std::vector<RGroupMatch> results; // std::map<int, RGroup> > result;
99  for (size_t i = 0; i < permutation.size(); ++i) {
100  CHECK_INVARIANT(i < matches.size(),
101  "Best Permutation mol idx out of range");
102  CHECK_INVARIANT(permutation[i] < matches[i].size(),
103  "Selected match at permutation out of range");
104  results.push_back(matches[i][permutation[i]]);
105  }
106 
107  if (removeAllHydrogenRGroups) {
108  // if a label is all hydrogens, remove it
109 
110  // This logic is a bit tricky, find all labels that have common cores
111  // and analyze those sets independently.
112  // i.e. if core 1 doesn't have R1 then don't analyze it in when looking
113  // at label 1
114  std::map<int, std::set<int>> labelCores; // map from label->cores
115  for (auto &position : results) {
116  int core_idx = position.core_idx;
117  if (labelCores.find(core_idx) == labelCores.end()) {
118  auto core = cores.find(core_idx);
119  if (core != cores.end()) {
120  for (auto rlabels : getRlabels(*core->second.core)) {
121  int rlabel = rlabels.first;
122  labelCores[rlabel].insert(core_idx);
123  }
124  }
125  }
126  }
127 
128  for (int label : labels) {
129  bool allH = true;
130  for (auto &position : results) {
131  R_DECOMP::const_iterator rgroup = position.rgroups.find(label);
132  bool labelHasCore = labelCores[label].find(position.core_idx) !=
133  labelCores[label].end();
134  if (labelHasCore && (rgroup == position.rgroups.end() ||
135  !rgroup->second->is_hydrogen)) {
136  allH = false;
137  break;
138  }
139  }
140 
141  if (allH) {
142  for (auto &position : results) {
143  position.rgroups.erase(label);
144  }
145  }
146  }
147  }
148  return results;
149  }
150 
151  class UsedLabels {
152  public:
153  std::set<int> labels_used;
154  bool add(int rlabel) {
155  if (labels_used.find(rlabel) != labels_used.end()) {
156  return false;
157  }
158  labels_used.insert(rlabel);
159  return true;
160  }
161 
162  int next() {
163  int i = 1;
164  while (labels_used.find(i) != labels_used.end()) {
165  ++i;
166  }
167  labels_used.insert(i);
168  return i;
169  }
170  };
171 
172  void relabelCore(RWMol &core, std::map<int, int> &mappings,
173  UsedLabels &used_labels, const std::set<int> &indexLabels,
174  std::map<int, std::vector<int>> extraAtomRLabels) {
175  // Now remap to proper rlabel ids
176  // if labels are positive, they come from User labels
177  // if they are negative, they come from indices and should be
178  // numbered *after* the user labels.
179  //
180  // Some indices are attached to multiple bonds,
181  // these rlabels should be incrementally added last
182  std::map<int, Atom *> atoms = getRlabels(core);
183  // a core only has one labelled index
184  // a secondary structure extraAtomRLabels contains the number
185  // of bonds between this atom and the side chain
186 
187  // a sidechain atom has a vector of the attachments back to the
188  // core that takes the place of numBondsToRlabel
189 
190  std::map<int, std::vector<int>> bondsToCore;
191  std::vector<std::pair<Atom *, Atom *>> atomsToAdd; // adds -R if necessary
192 
193  // Deal with user supplied labels
194  for (auto rlabels : atoms) {
195  int userLabel = rlabels.first;
196  if (userLabel < 0) {
197  continue; // not a user specified label
198  }
199  Atom *atom = rlabels.second;
200  mappings[userLabel] = userLabel;
201  used_labels.add(userLabel);
202 
203  if (atom->getAtomicNum() == 0) { // add to existing dummy/rlabel
204  setRlabel(atom, userLabel);
205  } else { // adds new rlabel
206  auto *newAt = new Atom(0);
207  setRlabel(newAt, userLabel);
208  atomsToAdd.emplace_back(atom, newAt);
209  }
210  }
211 
212  // Deal with non-user supplied labels
213  for (auto newLabel : indexLabels) {
214  auto atm = atoms.find(newLabel);
215  if (atm == atoms.end()) {
216  continue;
217  }
218 
219  Atom *atom = atm->second;
220 
221  int rlabel;
222  auto mapping = mappings.find(newLabel);
223  if (mapping == mappings.end()) {
224  rlabel = used_labels.next();
225  mappings[newLabel] = rlabel;
226  } else {
227  rlabel = mapping->second;
228  }
229 
230  if (atom->getAtomicNum() == 0) { // add to dummy
231  setRlabel(atom, rlabel);
232  } else {
233  auto *newAt = new Atom(0);
234  setRlabel(newAt, rlabel);
235  atomsToAdd.emplace_back(atom, newAt);
236  }
237  }
238 
239  // Deal with multiple bonds to the same label
240  for (auto &extraAtomRLabel : extraAtomRLabels) {
241  auto atm = atoms.find(extraAtomRLabel.first);
242  if (atm == atoms.end()) {
243  continue; // label not used in the rgroup
244  }
245  Atom *atom = atm->second;
246 
247  for (int &i : extraAtomRLabel.second) {
248  int rlabel = used_labels.next();
249  i = rlabel;
250  // Is this necessary?
252  atom->getAtomicNum() > 1,
253  "Multiple attachments to a dummy (or hydrogen) is weird.");
254  auto *newAt = new Atom(0);
255  setRlabel(newAt, rlabel);
256  atomsToAdd.emplace_back(atom, newAt);
257  }
258  }
259 
260  for (auto &i : atomsToAdd) {
261  core.addAtom(i.second, false, true);
262  core.addBond(i.first, i.second, Bond::SINGLE);
263  }
264  core.updatePropertyCache(false); // this was github #1550
265  }
266 
267  void relabelRGroup(RGroupData &rgroup, const std::map<int, int> &mappings) {
268  PRECONDITION(rgroup.combinedMol.get(), "Unprocessed rgroup");
269 
270  RWMol &mol = *rgroup.combinedMol.get();
271  if (rgroup.combinedMol->hasProp(done)) {
272  rgroup.labelled = true;
273  return;
274  }
275 
276  mol.setProp(done, true);
277  std::vector<std::pair<Atom *, Atom *>> atomsToAdd; // adds -R if necessary
278 
279  for (RWMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms();
280  ++atIt) {
281  Atom *atom = *atIt;
282  if (atom->hasProp(SIDECHAIN_RLABELS)) {
283  atom->setIsotope(0);
284  const std::vector<int> &rlabels =
285  atom->getProp<std::vector<int>>(SIDECHAIN_RLABELS);
286  // switch on atom mappings or rlabels....
287 
288  for (int rlabel : rlabels) {
289  auto label = mappings.find(rlabel);
290  CHECK_INVARIANT(label != mappings.end(), "Unprocessed mapping");
291 
292  if (atom->getAtomicNum() == 0) {
293  setRlabel(atom, label->second);
294  } else {
295  auto *newAt = new Atom(0);
296  setRlabel(newAt, label->second);
297  atomsToAdd.emplace_back(atom, newAt);
298  }
299  }
300  }
301  }
302 
303  for (auto &i : atomsToAdd) {
304  mol.addAtom(i.second, false, true);
305  mol.addBond(i.first, i.second, Bond::SINGLE);
306  }
307 
309  RDLog::BlockLogs blocker;
310  bool implicitOnly = false;
311  bool updateExplicitCount = false;
312  bool sanitize = false;
313  MolOps::removeHs(mol, implicitOnly, updateExplicitCount, sanitize);
314  }
315 
316  mol.updatePropertyCache(false); // this was github #1550
317  rgroup.labelled = true;
318  }
319 
320  // relabel the core and sidechains using the specified user labels
321  // if matches exist for non labelled atoms, these are added as well
322  void relabel() {
323  std::vector<RGroupMatch> best = GetCurrentBestPermutation();
324 
325  // get the labels used
326  std::set<int> userLabels;
327  std::set<int> indexLabels;
328 
329  // Go through all the RGroups and find out which labels were
330  // actually used.
331 
332  // some atoms will have multiple attachment points, i.e. cycles
333  // split these up into new rlabels if necessary
334  // These are detected at match time
335  // This vector will hold the extra (new) labels required
336  std::map<int, std::vector<int>> extraAtomRLabels;
337 
338  for (auto &it : best) {
339  for (auto &rgroup : it.rgroups) {
340  if (rgroup.first >= 0) {
341  userLabels.insert(rgroup.first);
342  }
343  if (rgroup.first < 0) {
344  indexLabels.insert(rgroup.first);
345  }
346 
347  std::map<int, int> rlabelsUsedInRGroup =
348  rgroup.second->getNumBondsToRlabels();
349  for (auto &numBondsUsed : rlabelsUsedInRGroup) {
350  // Make space for the extra labels
351  if (numBondsUsed.second > 1) { // multiple rgroup bonds to same atom
352  extraAtomRLabels[numBondsUsed.first].resize(numBondsUsed.second -
353  1);
354  }
355  }
356  }
357  }
358 
359  // Assign final RGroup labels to the cores and propagate these to
360  // the scaffold
361  finalRlabelMapping.clear();
362 
363  UsedLabels used_labels;
364  for (auto &core : cores) {
365  core.second.labelledCore.reset(new RWMol(*core.second.core));
366  relabelCore(*core.second.labelledCore, finalRlabelMapping, used_labels,
367  indexLabels, extraAtomRLabels);
368  }
369 
370  for (auto &it : best) {
371  for (auto &rgroup : it.rgroups) {
372  relabelRGroup(*rgroup.second, finalRlabelMapping);
373  }
374  }
375  }
376 
377  // compute the number of rgroups that would be added if we
378  // accepted this permutation
380  const std::vector<size_t> &tied_permutation,
381  std::vector<int> &heavy_counts) {
382  size_t i = 0;
383  size_t num_added_rgroups = 0;
384  heavy_counts.resize(labels.size(), 0);
385  for (int label : labels) {
386  int incr = (label > 0) ? -1 : 1;
387  bool incremented = false;
388  for (size_t m = 0; m < tied_permutation.size();
389  ++m) { // for each molecule
390  auto rg = matches[m][tied_permutation[m]].rgroups.find(label);
391  if (rg != matches[m][tied_permutation[m]].rgroups.end() &&
392  !rg->second->is_hydrogen) {
393  if (label <= 0 && !incremented) {
394  incremented = true;
395  ++num_added_rgroups;
396  }
397  heavy_counts[i] += incr;
398  }
399  }
400  ++i;
401  }
402  return num_added_rgroups;
403  }
404 
405  bool process(bool pruneMatches, bool finalize = false) {
406  if (matches.size() == 0) {
407  return false;
408  }
409  auto t0 = std::chrono::steady_clock::now();
410  // Exhaustive search, get the MxN matrix
411  // (M = matches.size(): number of molecules
412  // N = iterator.maxPermutations)
413  std::vector<size_t> permutations;
414 
415  std::transform(matches.begin(), matches.end(),
416  std::back_inserter(permutations),
417  [](const std::vector<RGroupMatch> &m) { return m.size(); });
418  permutation = std::vector<size_t>(permutations.size(), 0);
419 
420  // run through all possible matches and score each
421  // set
422  double best_score = 0;
423  std::vector<size_t> best_permutation = permutation;
424  std::vector<std::vector<size_t>> ties;
425 
426  size_t count = 0;
427 #ifdef DEBUG
428  std::cerr << "Processing" << std::endl;
429 #endif
430  CartesianProduct iterator(permutations);
431  // Iterates through the permutation idx, i.e.
432  // [m1_permutation_idx, m2_permutation_idx, m3_permutation_idx]
433 
434  while (iterator.next()) {
435  if (count > iterator.maxPermutations) {
436  throw ValueErrorException("next() did not finish");
437  }
438 #ifdef DEBUG
439  std::cerr << "**************************************************"
440  << std::endl;
441 #endif
442  double newscore = score(iterator.permutation, matches, labels);
443 
444  if (fabs(newscore - best_score) <
445  1e-6) { // heuristic to overcome floating point comparison issues
446  ties.push_back(iterator.permutation);
447  } else if (newscore > best_score) {
448 #ifdef DEBUG
449  std::cerr << " ===> current best:" << newscore << ">" << best_score
450  << std::endl;
451 #endif
452  ties.clear();
453  ties.push_back(iterator.permutation);
454  best_score = newscore;
455  best_permutation = iterator.permutation;
456  }
457  ++count;
458  }
459 
460  if (ties.size() > 1) {
461  size_t min_perm_value = 0;
462  size_t smallest_added_rgroups = labels.size();
463  for (const auto &tied_permutation : ties) {
464  std::vector<int> heavy_counts;
465  std::vector<int> largest_heavy_counts(labels.size(), 0);
466  size_t num_added_rgroups =
467  compute_num_added_rgroups(tied_permutation, heavy_counts);
468  if (num_added_rgroups < smallest_added_rgroups) {
469  smallest_added_rgroups = num_added_rgroups;
470  best_permutation = tied_permutation;
471  } else if (num_added_rgroups == smallest_added_rgroups) {
472  if (heavy_counts < largest_heavy_counts) {
473  largest_heavy_counts = heavy_counts;
474  best_permutation = tied_permutation;
475  } else if (heavy_counts == largest_heavy_counts) {
476  size_t perm_value = iterator.value();
477  if (perm_value < min_perm_value) {
478  min_perm_value = perm_value;
479  best_permutation = tied_permutation;
480  }
481  }
482  }
484  }
485  }
486  permutation = best_permutation;
487  if (pruneMatches || finalize) {
488  prune();
489  }
490 
491  if (finalize) {
492  relabel();
493  }
494 
495  return true;
496  }
497 };
498 }
499 
500 #endif
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
The class for representing atoms.
Definition: Atom.h:69
void setIsotope(unsigned int what)
sets our isotope number
int getAtomicNum() const
returns our atomic number
Definition: Atom.h:116
void setAtomMapNum(int mapno, bool strict=true)
Set the atom map Number of the atom.
Definition: Atom.h:366
@ SINGLE
Definition: Bond.h:58
void getProp(const std::string &key, T &res) const
allows retrieval of a particular property value
Definition: RDProps.h:102
bool hasProp(const std::string &key) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: RDProps.h:121
void setProp(const std::string &key, T val, bool computed=false) const
sets a property value
Definition: RDProps.h:72
AtomIterator endAtoms()
get an AtomIterator pointing at the end of our Atoms
void updatePropertyCache(bool strict=true)
calculates any of our lazy properties
AtomIterator beginAtoms()
get an AtomIterator pointing at our first Atom
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:31
unsigned int addAtom(bool updateLabel=true)
adds an empty Atom to our collection
unsigned int addBond(unsigned int beginAtomIdx, unsigned int endAtomIdx, Bond::BondType order=Bond::UNSPECIFIED)
adds a Bond between the indicated Atoms
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:39
static std::string to_string(const Descriptor &desc)
Definition: Descriptor.h:54
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
RDKIT_RDGENERAL_EXPORT const std::string dummyLabel
Std stuff.
Definition: Abbreviations.h:17
std::map< int, Atom * > getRlabels(const RWMol &mol)
Get the RLabels,atom mapping for the current molecule.
RDKIT_GRAPHMOL_EXPORT void setAtomRLabel(Atom *atm, int rlabel)
Set the atom's MDL integer RLabel.
const std::string done
@ MDLRGroup
Definition: RGroupDecomp.h:49
@ AtomMap
Definition: RGroupDecomp.h:47
@ Isotope
Definition: RGroupDecomp.h:48
double score(const std::vector< size_t > &permutation, const std::vector< std::vector< RGroupMatch >> &matches, const std::set< int > &labels)
bool checkForTimeout(const std::chrono::steady_clock::time_point &t0, double timeout, bool throwOnTimeout=true)
Definition: RGroupDecomp.h:129
const std::string SIDECHAIN_RLABELS
const unsigned int EMPTY_CORE_LABEL
Definition: RGroupUtils.h:20
iterate through all possible permutations of the rgroups
Definition: RGroupScore.h:19
std::vector< size_t > permutation
Definition: RGroupScore.h:20
RCore is the core common to a series of molecules.
Definition: RGroupCore.h:19
A single rgroup attached to a given core.
Definition: RGroupData.h:27
boost::shared_ptr< RWMol > combinedMol
Definition: RGroupData.h:28
size_t compute_num_added_rgroups(const std::vector< size_t > &tied_permutation, std::vector< int > &heavy_counts)
std::vector< std::vector< RGroupMatch > > matches
RGroupDecompData(const RWMol &inputCore, RGroupDecompositionParameters inputParams)
void relabelRGroup(RGroupData &rgroup, const std::map< int, int > &mappings)
std::vector< size_t > permutation
std::map< int, std::vector< int > > userLabels
RGroupDecompositionParameters params
std::map< std::string, int > newCores
std::map< int, RCore > cores
void setRlabel(Atom *atom, int rlabel)
void relabelCore(RWMol &core, std::map< int, int > &mappings, UsedLabels &used_labels, const std::set< int > &indexLabels, std::map< int, std::vector< int >> extraAtomRLabels)
std::vector< int > processedRlabels
std::map< int, int > finalRlabelMapping
RGroupDecompData(const std::vector< ROMOL_SPTR > &inputCores, RGroupDecompositionParameters inputParams)
std::vector< RGroupMatch > GetCurrentBestPermutation() const
bool process(bool pruneMatches, bool finalize=false)
double timeout
timeout in seconds. <=0 indicates no timeout
Definition: RGroupDecomp.h:69
bool prepareCore(RWMol &, const RWMol *alignCore)