On 11/10/2010 20:21, Noel O'Boyle wrote:
I went through a large dataset of PubChem 3D structures looking for
implicit H failures (removing, then adding Hs). 1/3 of the failures
are due to the following in atomtyp.txt:

INTHYB  [$([#6]([#8D1])[#8D1])]   2       #sp2 carbon

This makes any C attached to two Os turn into sp2...even in geminal
diols , for example, ClC(Cl)(Cl)C(O)O. According to wikipedia, these
don't tend to last long but they are in PubChem (whether errors or
not, I can't say).

If it's commented out, then it works fine for these cases.

Is there any reason for this rule (it seems to date from the early
days)? Perhaps it's to correct ligand structures from the PDB where
all examples of this indicate COO-? If so, maybe the PDB cases are
better handled in the code using the molecular geometry...?

I have always regarded the implicit valency model as unsatisfactory, and maybe it has become a bit messed up over the years. Is it currently used for anything other than recognizing when a hydrogen could or should be added to an atom?

A simpler and more obvious model for this purpose has essentially a single IMPVAL for each charge state of the molecule. (Only if you are interested in radicals or hydrogen on the higher valency states of second row elements do you need another rule for each higher valence.) There is no need for any skilful fine tuning. It is more maintainable and will be faster because not so many SMARTS patterns need to be matched. Up to now, It has worked for everything I've tried (although this not very extensive), except with test_formula, where the fault is in a couple of erroneous results in formularesults.txt, which at least shows the old model was error-prone and needs some more tweaking of phosphate structures.

There may be other side effects I'm not aware of. Just before a release is not a good time to commit something like this (5 years ago would have been better), so I've just attached a patch (changes to 11 code lines), if you want to try it.

Chris
Index: data/atomtyp.txt
===================================================================
--- data/atomtyp.txt    (revision 4157)
+++ data/atomtyp.txt    (working copy)
@@ -77,105 +77,63 @@
 #
 #IMPLICIT VALENCE RULES
 #
-#IMPVAL  [#3,#11,#19,#37,#55,#87]  1 # Alkali metals
-#IMPVAL  [#4,#12,#20,#38,#56,#88]  2 # Alkaline earth
-
 IMPVAL  [#5]                      3 # Generic boron
-IMPVAL [$([#6+]=*)]                      2
-IMPVAL [$([#6+]#*)]                      1 # just for InChI C2
 
-IMPVAL  [#6^3]                    4
-IMPVAL  [#6^2]                    3
-IMPVAL  [#6^1]                    2
-IMPVAL  [#6^3-]                   3
-IMPVAL  [#6^2-]                   2
-IMPVAL  [#6^1-]                   1 # isocyanides, CO (valence=1, bond order=3)
-IMPVAL  [c+]                      2
+IMPVAL  [#6]                      4 # Generic carbon
+IMPVAL  [#6-]                     3
+IMPVAL  [#6+]                     3
 
-IMPVAL  [N^3]                     3
-IMPVAL  [N^3+]                    4
-IMPVAL  [N^2]                     3
-#IMPVAL  [N^2-,N^3-]              2
-IMPVAL  [ND1^1]                   1
-IMPVAL  [ND2^1]                   2
-IMPVAL  [$(N([OD1])*)]            3
-IMPVAL  [$(NC=O)]                 3
-#IMPVAL  [$(n(c=O)c=O)]           3 # aromatic diimide
-IMPVAL  [$(N(~[OD1])~[OD1])]      3
-IMPVAL  [$([Nr0]C=[Nr0])]         3
-IMPVAL  [$(N=*)]                  2
-#IMPVAL [$([Nr0]=C[Nr0])]         3
-IMPVAL  [$([Nr0]=C[Nr0])]         2
-IMPVAL  [$([N+r0]=C[Nr0])]        3 # patch from Steve Constable
-IMPVAL  [$([#7D1-]=*)]            1
-IMPVAL  [N^1-]                    1 # N2O [N-]=[N+]=O (valence=1, bondorder=2)
-IMPVAL  [N-]                      2
-IMPVAL  [n]                       2
-IMPVAL  [n-]                      2
-IMPVAL  [n+]                      3
-IMPVAL  [$([#7-]=*)]              1
-IMPVAL  [$([#7+]#*)]              2
+IMPVAL  [#7]                      3 # Generic nitrogen
+IMPVAL  [#7+]                     4
+IMPVAL  [#7-]                     2
 
-IMPVAL [$([#8+]#*)] 1
+IMPVAL  [#8]                      2 # Generic oxygen
+IMPVAL  [#8-]                     1
+IMPVAL  [#8-2]                    0
+IMPVAL  [#8+]                     3
 
-IMPVAL [#8]                       2 # Generic O
-IMPVAL [#8-]                      1
-IMPVAL [#8-2]                     0
-IMPVAL [#8+]                      3
-IMPVAL [$([#8]=*)]                1
-IMPVAL [#8^1+]                    1 # carbon monoxide
+IMPVAL  [#14]                     4 # Generic silicon
+IMPVAL  [#14+]                    3
+IMPVAL  [#14-]                    3
 
-IMPVAL  [#9D0]                    1
-IMPVAL  [#9D0-]                   0 #[F-]
+IMPVAL  [#16]                     2 # Generic sulfur
+IMPVAL  [#16-]                    1
+IMPVAL  [#16-2]                   0
+IMPVAL  [#16+]                    3
+IMPVAL  [#16v3]                   4 # Higher valence S
+IMPVAL  [#16v5]                   6
 
-IMPVAL  [#13]                     3 # generic Al
+IMPVAL  [#17]                     1
+IMPVAL  [#17-]                    0
+IMPVAL  [#17v2]                   3 # Higher valence Cl
+IMPVAL  [#17v4]                   5
+IMPVAL  [#17v6]                   7
 
-IMPVAL  [#14]                     4 # regular sp3 Si
-IMPVAL  [#14^2]                   2 # silylenes
-IMPVAL  [$([#14]=*)]              3 # e.g., Si=O 
-IMPVAL  [$([#14](=*)=*)]          0 # e.g., Si(=O)=O
-IMPVAL  [$([#14+]#*)]                    1 # just for InChI Si2
-IMPVAL  [$([#14-]#*)]                    1 # just for InChI Si2
-
-IMPVAL  [#15D4]                   4
-IMPVAL  [#15D3]                   3
-IMPVAL  [#15D2]                   3
-IMPVAL  [#15D1^3]                 3
-IMPVAL  [#15D1^2]                 3
-IMPVAL  [#15D0]                   3
-IMPVAL  [$([#15]=[#6])]           2
-IMPVAL  [pD2]                     2
-IMPVAL  [$([#15]=[#8])]           4 # phosphinate (double bond = one valence)
-IMPVAL  [$([#15+](=[#8])([#8]))]      0 # R-P+O2
-
-IMPVAL  [#16]                     2 #revised CM April 2008
-IMPVAL  [$([#16D1]=*)]            1 
-IMPVAL  [$([#16D3](=O)(=O)-O)]    4 #e.g. O[S](=O)=O
-IMPVAL  [$([#16D2](=O)-O)]        3 # e.g. O[S]=O
-IMPVAL  [#16D1-]                  1
-
-IMPVAL  [#17D0]                   1
-IMPVAL  [#17D0-]                  0 #[Cl-]
-
 IMPVAL  [#31]                     3
 IMPVAL  [#32]                     4
 
 IMPVAL  [#33]                     3
-IMPVAL  [as]                      3
 
 IMPVAL  [#34]                     2
 
-IMPVAL  [#35D0]                   1
-IMPVAL  [#35D0-]                  0 #[Br-]
+IMPVAL  [#35]                     1
+IMPVAL  [#35-]                    0 #[Br-]
+IMPVAL  [#35v2]                   3 # Higher valence Br
+IMPVAL  [#35v4]                   5
+IMPVAL  [#35v6]                   7
 
 #IMPVAL  [#49,#81]                 3 # In, Tl, like Ga
 #IMPVAL  [#50,#82]                 4 # Sn, Pb, like Ge
 #IMPVAL  [#51,#83]                 3 # Sb, Bi, like As
 
 IMPVAL  [#52]                     2
-IMPVAL  [#53D0]                   1
-IMPVAL  [#53D0-]                  0 #[I-]
 
+IMPVAL  [#53]                     1
+IMPVAL  [#53-]                    0 #[I-]
+IMPVAL  [#53v2]                   3 # Higher valence I
+IMPVAL  [#53v4]                   5
+IMPVAL  [#53v6]                   7
+
 #
 #EXTERNAL TYPE RULES
 #
Index: src/atom.cpp
===================================================================
--- src/atom.cpp        (revision 4157)
+++ src/atom.cpp        (working copy)
@@ -968,7 +968,7 @@
     unsigned int bosum;
     OBBondIterator i;
 
-    bosum = GetImplicitValence();
+    bosum = GetValence() + ImplicitHydrogenCount();
 
     for (bond = ((OBAtom*)this)->BeginBond(i);bond;bond = 
((OBAtom*)this)->NextBond(i))
       {
@@ -989,7 +989,7 @@
       atomtyper.AssignImplicitValence(*((OBMol*)((OBAtom*)this)->GetParent()));
 
     // _impval is assigned by the atomtyper -- same as calling 
GetImplicitValence()
-    int impval = _impval - GetValence();
+    int impval = _impval - BOSum();
 
     // we need to modify this implicit valence if we're a radical center
     int mult = GetSpinMultiplicity();
Index: src/graphsym.cpp
===================================================================
--- src/graphsym.cpp    (revision 4157)
+++ src/graphsym.cpp    (working copy)
@@ -174,7 +174,7 @@
         else if (bond->IsAromatic()) count += 1.6f;
       }
     }
-    if (atom->GetAtomicNum() == 7 && atom->IsAromatic() && 
atom->GetImplicitValence() == 3) {
+    if (atom->GetAtomicNum() == 7 && atom->IsAromatic() && 
atom->GetValence()+atom->ImplicitHydrogenCount() == 3) {
       count += 1;         // [nH] - add another bond
     }
     return(int(count + 0.5));     // round to nearest int
Index: src/mol.cpp
===================================================================
--- src/mol.cpp (revision 4157)
+++ src/mol.cpp (working copy)
@@ -2037,7 +2037,7 @@
                            atom->IsSulfur() || atom->IsPhosphorus()))
           continue;
 
-        hcount = atom->GetImplicitValence() - atom->GetValence();
+        hcount = atom->GetImplicitValence() - atom->BOSum();
 
         //Jan 05 Implicit valency now left alone; use spin multiplicity for 
implicit Hs
         int mult = atom->GetSpinMultiplicity();
@@ -2191,7 +2191,7 @@
     int hcount,count=0;
     vector<pair<OBAtom*,int> > vhadd;
 
-    hcount = atom->GetImplicitValence() - atom->GetValence();
+    hcount = atom->GetImplicitValence() - atom->BOSum();
 
     //Jan 05 Implicit valency now left alone; use spin multiplicity for 
implicit Hs
     int mult = atom->GetSpinMultiplicity();
@@ -2347,7 +2347,7 @@
             || (!atom->IsHydrogen() && 
atom->ExplicitHydrogenCount(true)!=0)//exclude D,T
             || atom->HasNoHForced())
           {
-            diff=atom->GetImplicitValence() - (atom->GetHvyValence() + 
atom->ExplicitHydrogenCount());
+            diff=atom->GetImplicitValence() - atom->BOSum();
             if (diff)
               atom->SetSpinMultiplicity(diff+1);//radicals =2; all carbenes =3
           }
@@ -2555,7 +2555,7 @@
           }
 
         if (secondpass && atom->IsNitrogen() && atom->GetFormalCharge() == 0 &&
-            atom->GetImplicitValence() == 2) //try protonating the nitrogen
+          atom->GetValence() + atom->ImplicitHydrogenCount() == 2) //try 
protonating the nitrogen
           {
             atom->IncrementImplicitValence();
             if (ExpandKekule(mol,va,i+1,maxv,secondpass))
Index: src/typer.cpp
===================================================================
--- src/typer.cpp       (revision 4157)
+++ src/typer.cpp       (working copy)
@@ -243,7 +243,7 @@
     OBAtom *atom;
     vector<OBAtom*>::iterator k;
     for (atom = mol.BeginAtom(k);atom;atom = mol.NextAtom(k))
-      atom->SetImplicitValence(atom->GetValence());
+      atom->SetImplicitValence(atom->BOSum());
 
     vector<vector<int> >::iterator j;
     vector<pair<OBSmartsPattern*,int> >::iterator i;
@@ -261,8 +261,8 @@
 
     for (atom = mol.BeginAtom(k);atom;atom = mol.NextAtom(k))
       {
-        if (atom->GetImplicitValence() < atom->GetValence())
-          atom->SetImplicitValence(atom->GetValence());
+        if (atom->GetImplicitValence() < atom->BOSum())
+          atom->SetImplicitValence(atom->BOSum());
       }
 
     // FF Come back to the initial flags
@@ -513,7 +513,7 @@
     //sanity check - exclude all 4 substituted atoms and sp centers
     for (atom = mol.BeginAtom(i);atom;atom = mol.NextAtom(i))
       {
-        if (atom->GetImplicitValence() > 3)
+        if (atom->GetValence() > 3)
           {
             _vpa[atom->GetIdx()] = false;
             continue;
Index: test/files/formularesults.txt
===================================================================
--- test/files/formularesults.txt       (revision 4157)
+++ test/files/formularesults.txt       (working copy)
@@ -247,8 +247,8 @@
 C15H19NO9S 389.378 389.078
 C24H31NO15S 605.566 605.141
 C8H5BrO3PSe 338.961 338.832
-C5H12N2O3PS 211.199 211.031
-C3H10N2O3PS 185.162 185.015
+C5H11N2O3PS 210.191 210.023
+C3H9N2O3PS 184.154 184.007
 C10H21BN2O3 228.096 228.165
 C7H17BN2O3 188.032 188.133
 C4H13BN2O 115.97 116.112
------------------------------------------------------------------------------
Beautiful is writing same markup. Internet Explorer 9 supports
standards for HTML5, CSS3, SVG 1.1,  ECMAScript5, and DOM L2 & L3.
Spend less time writing and  rewriting code and more time creating great
experiences on the web. Be a part of the beta today.
http://p.sf.net/sfu/beautyoftheweb
_______________________________________________
OpenBabel-Devel mailing list
OpenBabel-Devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/openbabel-devel

Reply via email to