Hi there,
Here's a heads-up on some work I've been prototyping.
The aromatic atom typer currently uses SMARTS patterns in aromatic.txt
to assign max/min values of pi electrons. A more efficient approach is
to simultaneously match against all the SMARTS patterns rather than
one at a time, and well, to avoid using SMARTS at all.
I've attached a Python prototype that shows the general idea - see the
function getMinMax (the calls to IsAromatic will have to be removed,
but are unavoidable here; the "elif"s will become a switch statement;I
need to think some more about explicit hydrogens). To my mind, the use
of a direct lookup is as clear, if not clearer, than using SMARTS
patterns.
I note that the existing tests don't hit all of the patterns, and
while I can find molecules in ChEMBL that hit almost all of the
patterns, I'm not sure whether I can find ones where the corresponding
atom turns out to be aromatic in the end. I have a feeling this is
because the patterns were added in response to dodgy smiles (e.g.
using n instead of [nH]) which were reported or found by Geoff.
Regards,
- Noel
import pybel
aromatictext = """
#PATTERN MIN MAX
#carbon patterns
[#6rD2+0] 1
# exo ketone or alcohol -- don't know which
[#6rD3+0]~!@[#8] 2
[#6rD2+,#6rD3+] 3
[#6r+0]=@* 4
[#6rD3+0]=!@* 5
# external double bonds to hetero atoms contribute no electrons to the
# aromatic systems -- quinoid systems are non-aromatic, e.g. 1,4-benzoquinone
[#6rD3+0]=!@[!#6] 6
[#6rD3-] 7
#nitrogen patterns
[#7rD2+0] 8
[#7rD3+0] 9
[#7r+0](-@*)-@* 10
[#7rD2+0]=@* 11
[#7rD3+] 12
[#7rD3+0]=O 13
[#7rD2-] 14
#oxygen patterns
[#8r+0] 15
[#8r+] 16
#sulfur patterns
[#16rD2+0] 17
[#16rD2+] 18
[#16rD3+0]=!@O 19
#other misc patterns
# Accounts Chem Res 1978 11 p. 153
# phosphole, phosphabenzene (not v. aromatic)
[#15rD3+0] 20
[#15rD3]([#8r])[#8r] 21
# selenophene
[#34rD2+0] 22
# arsabenzene, etc. (*really* not v. aromatic)
#[#33rD3+0] 23
# tellurophene, etc. (*really* not v. aromatic)
#[#52rD2+0] 24
# stilbabenzene, etc. (very little aromatic character)
#[#51rD3+0] 25
"""
def HasAcyclicDblBond(atm):
for bond in pybel.ob.OBAtomBondIter(atm):
if bond.GetBondOrder()==2 and not bond.IsInRing():
return True
return False
def HasAcyclicDblBondToOxygen(atm):
for bond in pybel.ob.OBAtomBondIter(atm):
if bond.GetBondOrder()==2 and not bond.IsInRing() and bond.GetNbrAtom(atm).GetAtomicNum()==8:
return True
return False
def HasAcyclicDblBondToNonCarbon(atm):
for bond in pybel.ob.OBAtomBondIter(atm):
if bond.GetBondOrder()==2 and not bond.IsInRing() and bond.GetNbrAtom(atm).GetAtomicNum()!=6:
return True
return False
def HasAcyclicBondToOxygen(atm):
for bond in pybel.ob.OBAtomBondIter(atm):
if not bond.IsInRing() and bond.GetNbrAtom(atm).GetAtomicNum()==8:
return True
return False
def HasTwoSingleRingBonds(atm):
tot = 0
for bond in pybel.ob.OBAtomBondIter(atm):
if bond.GetBondOrder()==1 and bond.IsInRing() and not bond.IsAromatic():
if tot==1:
return True
tot += 1
return False
def HasTwoRingOxygens(atm):
tot = 0
for bond in pybel.ob.OBAtomBondIter(atm):
if bond.GetBondOrder()==1:
nbr = bond.GetNbrAtom(atm)
if nbr.IsInRing() and nbr.GetAtomicNum()==8:
if tot==1:
return True
tot += 1
return False
def HasRingDblBond(atm):
for bond in pybel.ob.OBAtomBondIter(atm):
if bond.GetBondOrder()==2 and bond.IsInRing() and not bond.IsAromatic():
return True
return False
def getMinMax(atom):
atm = atom.OBAtom
if not atm.IsInRing(): return 0
elem = atm.GetAtomicNum()
deg = atm.GetHvyValence()
chg = atm.GetFormalCharge()
if elem==6:
if chg==0:
if HasRingDblBond(atm):
return 4
if deg==2:
return 1
elif deg==3:
# merge the following three tests into one iteration over the acyclic bonds
if HasAcyclicDblBondToNonCarbon(atm):
return 6
if HasAcyclicDblBond(atm):
return 5
if HasAcyclicBondToOxygen(atm):
return 2
elif chg==1:
if deg==2 or deg==3:
return 3
elif chg==-1:
if deg==3:
return 7
elif elem==7:
if chg==0:
if deg==3 and HasAcyclicDblBondToOxygen(atm):
return 13
if HasTwoSingleRingBonds(atm):
return 10
if deg==2:
if HasRingDblBond(atm):
return 11
return 8
elif deg==3:
return 9
elif chg==-1:
if deg==2:
return 14
elif chg==1:
if deg==3:
return 12
elif elem==8:
if chg==0:
return 15
elif chg==1:
return 16
elif elem==15:
if deg==3:
if HasTwoRingOxygens(atm):
return 21
if chg==0:
return 20
elif elem==16:
if chg==0:
if deg==2:
return 17
elif deg==3:
if HasAcyclicDblBondToOxygen(atm):
return 19
elif chg==1:
if deg==2:
return 18
elif elem==34:
if chg==0 and deg==2:
return 22
return 0
class PatternMatcher():
def __init__(self):
pats = []
for line in aromatictext.split("\n"):
tmp = line.strip()
if not tmp or tmp[0]=='#': continue
broken = tmp.split()
pats.append( (pybel.Smarts(broken[0]), int(broken[1])) )
self.pats = pats
def match(self, mol):
ans = [0]*len(mol.atoms)
for pat, idx in self.pats:
pat.obsmarts.Match(mol.OBMol)
vector = pat.obsmarts.GetMapList() # using GetMapList() like in the C++ code
tmp = list(vector)
if not tmp: continue
for x in tmp:
ans[x[0]-1] = idx
tmp = []
for atom in mol.atoms:
tmp.append(getMinMax(atom))
return ans, tmp
class Processor:
def __init__(self):
self.matcher = PatternMatcher()
def handleMol(self, smi):
mol = pybel.readstring("smi", smi)
mol.removeh()
res = self.matcher.match(mol)
return res
if __name__ == "__main__":
proc = Processor()
smis = ["c1ccccc1", "c1ccccc1C", "c1ncccc1", "c1[nH]ccc1", "c1[n-]ccc1", "c1[n+](C)cc1",
"Cn1ccnc1O", "c1cssc1=O", "C[n+]1ccsc1", "c1cnn2c1NCC2", "c1[n-]ncn1", "c1csc2c1CCN=C2N",
"Cc1cccc2c1[s+]sn2", "C=C1CCc2ccccc2C1=O", "COc1cccc2c1NS(=O)S2", "CC1(COP(OC1)Oc2ccccc2)C",
"c1ccc2c(c1)Nc3ccccc3P2=O", "CCc1no[c+](n1C)c2ccccc2O", "CCCC[Se]1(c2ccccc2C[O+]1C)Br",
"C(Nc1c(n[c-]([n+](=O)c1N)N)N)O", "c1ccc-2c(c1)CCc3c2c(n(n3)c4ccccn4=O)O"]
seen = set()
for smi in smis:
a, b = proc.handleMol(smi)
print a, b, a==b
assert a==b
for x in a:
seen.add(x)
for line in open(r"C:\Tools\openbabel\baoilleach\test\files\aromatics.smi"):
smi = line.split()[0]
a, b = proc.handleMol(smi)
assert a==b
for x in a:
seen.add(x)
print seen
start = 0
for idx, line in enumerate(open("tmp_sorted.txt")):
if idx < start: continue
smi = line.split()[1]
if 'c' not in smi and 'n' not in smi: continue
a, b = proc.handleMol(smi)
if any(x not in seen for x in a):
print idx, line.rstrip()
print a
print
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
OpenBabel-Devel mailing list
OpenBabel-Devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/openbabel-devel