Hi there,

Here's a heads-up on some work I've been prototyping.

The aromatic atom typer currently uses SMARTS patterns in aromatic.txt
to assign max/min values of pi electrons. A more efficient approach is
to simultaneously match against all the SMARTS patterns rather than
one at a time, and well, to avoid using SMARTS at all.

I've attached a Python prototype that shows the general idea - see the
function getMinMax (the calls to IsAromatic will have to be removed,
but are unavoidable here; the "elif"s will become a switch statement;I
need to think some more about explicit hydrogens). To my mind, the use
of a direct lookup is as clear, if not clearer, than using SMARTS
patterns.

I note that the existing tests don't hit all of the patterns, and
while I can find molecules in ChEMBL that hit almost all of the
patterns, I'm not sure whether I can find ones where the corresponding
atom turns out to be aromatic in the end. I have a feeling this is
because the patterns were added in response to dodgy smiles (e.g.
using n instead of [nH]) which were reported or found by Geoff.

Regards,
- Noel
import pybel

aromatictext = """
#PATTERN		MIN	MAX

#carbon patterns
[#6rD2+0]		1
# exo ketone or alcohol -- don't know which
[#6rD3+0]~!@[#8]	2
[#6rD2+,#6rD3+]		3
[#6r+0]=@*		4
[#6rD3+0]=!@*		5
# external double bonds to hetero atoms contribute no electrons to the
# aromatic systems -- quinoid systems are non-aromatic, e.g. 1,4-benzoquinone
[#6rD3+0]=!@[!#6]	6
[#6rD3-]		7

#nitrogen patterns
[#7rD2+0]		8
[#7rD3+0]               9
[#7r+0](-@*)-@* 	10
[#7rD2+0]=@*		11
[#7rD3+]		12
[#7rD3+0]=O		13
[#7rD2-]		14

#oxygen patterns
[#8r+0]			15
[#8r+]			16

#sulfur patterns
[#16rD2+0]		17
[#16rD2+]		18
[#16rD3+0]=!@O		19

#other misc patterns
# Accounts Chem Res 1978 11 p. 153
# phosphole, phosphabenzene (not v. aromatic)
[#15rD3+0]		20
[#15rD3]([#8r])[#8r] 21
# selenophene
[#34rD2+0]		22
# arsabenzene, etc. (*really* not v. aromatic)
#[#33rD3+0]		23
# tellurophene, etc. (*really* not v. aromatic)
#[#52rD2+0]		24
# stilbabenzene, etc. (very little aromatic character)
#[#51rD3+0]		25
"""

def HasAcyclicDblBond(atm):
    for bond in pybel.ob.OBAtomBondIter(atm):
        if bond.GetBondOrder()==2 and not bond.IsInRing():
            return True
    return False

def HasAcyclicDblBondToOxygen(atm):
    for bond in pybel.ob.OBAtomBondIter(atm):
        if bond.GetBondOrder()==2 and not bond.IsInRing() and bond.GetNbrAtom(atm).GetAtomicNum()==8:
            return True
    return False

def HasAcyclicDblBondToNonCarbon(atm):
    for bond in pybel.ob.OBAtomBondIter(atm):
        if bond.GetBondOrder()==2 and not bond.IsInRing() and bond.GetNbrAtom(atm).GetAtomicNum()!=6:
            return True
    return False

def HasAcyclicBondToOxygen(atm):
    for bond in pybel.ob.OBAtomBondIter(atm):
        if not bond.IsInRing() and bond.GetNbrAtom(atm).GetAtomicNum()==8:
            return True
    return False

def HasTwoSingleRingBonds(atm):
    tot = 0
    for bond in pybel.ob.OBAtomBondIter(atm):
        if bond.GetBondOrder()==1 and bond.IsInRing() and not bond.IsAromatic():
            if tot==1:
                return True
            tot += 1
    return False

def HasTwoRingOxygens(atm):
    tot = 0
    for bond in pybel.ob.OBAtomBondIter(atm):
        if bond.GetBondOrder()==1:
            nbr = bond.GetNbrAtom(atm)
            if nbr.IsInRing() and nbr.GetAtomicNum()==8:
                if tot==1:
                    return True
                tot += 1
    return False

def HasRingDblBond(atm):
    for bond in pybel.ob.OBAtomBondIter(atm):
        if bond.GetBondOrder()==2 and bond.IsInRing() and not bond.IsAromatic():
            return True
    return False

def getMinMax(atom):
    atm = atom.OBAtom
    if not atm.IsInRing(): return 0
    elem = atm.GetAtomicNum()
    deg = atm.GetHvyValence()
    chg = atm.GetFormalCharge()
    if elem==6:
        if chg==0:
            if HasRingDblBond(atm):
                return 4
            if deg==2:
                return 1
            elif deg==3:
                # merge the following three tests into one iteration over the acyclic bonds
                if HasAcyclicDblBondToNonCarbon(atm):
                    return 6
                if HasAcyclicDblBond(atm):
                    return 5
                if HasAcyclicBondToOxygen(atm):
                    return 2
        elif chg==1:
            if deg==2 or deg==3:
                return 3
        elif chg==-1:
            if deg==3:
                return 7
    elif elem==7:
        if chg==0:
            if deg==3 and HasAcyclicDblBondToOxygen(atm):
                return 13
            if HasTwoSingleRingBonds(atm):
                return 10
            if deg==2:
                if HasRingDblBond(atm):
                    return 11
                return 8
            elif deg==3:
                return 9
        elif chg==-1:
            if deg==2:
                return 14
        elif chg==1:
            if deg==3:
                return 12
    elif elem==8:
        if chg==0:
            return 15
        elif chg==1:
            return 16
    elif elem==15:
        if deg==3:
            if HasTwoRingOxygens(atm):
                return 21
            if chg==0:
                return 20
    elif elem==16:
        if chg==0:
            if deg==2:
                return 17
            elif deg==3:
                if HasAcyclicDblBondToOxygen(atm):
                    return 19
        elif chg==1:
            if deg==2:
                return 18
    elif elem==34:
        if chg==0 and deg==2:
            return 22
    return 0

class PatternMatcher():
    def __init__(self):
        pats = []
        for line in aromatictext.split("\n"):
            tmp = line.strip()
            if not tmp or tmp[0]=='#': continue
            broken = tmp.split()
            pats.append( (pybel.Smarts(broken[0]), int(broken[1])) )
        self.pats = pats

    def match(self, mol):
        ans = [0]*len(mol.atoms)
        for pat, idx in self.pats:
            pat.obsmarts.Match(mol.OBMol)
            vector = pat.obsmarts.GetMapList() # using GetMapList() like in the C++ code
            tmp = list(vector)
            if not tmp: continue
            for x in tmp:
                ans[x[0]-1] = idx

        tmp = []
        for atom in mol.atoms:
            tmp.append(getMinMax(atom))

        return ans, tmp

class Processor:
    def __init__(self):
        self.matcher = PatternMatcher()
    def handleMol(self, smi):
        mol = pybel.readstring("smi", smi)
        mol.removeh()
        res = self.matcher.match(mol)
        return res

if __name__ == "__main__":
    proc = Processor()
    smis = ["c1ccccc1", "c1ccccc1C", "c1ncccc1", "c1[nH]ccc1", "c1[n-]ccc1", "c1[n+](C)cc1",
            "Cn1ccnc1O", "c1cssc1=O", "C[n+]1ccsc1", "c1cnn2c1NCC2", "c1[n-]ncn1", "c1csc2c1CCN=C2N",
            "Cc1cccc2c1[s+]sn2", "C=C1CCc2ccccc2C1=O", "COc1cccc2c1NS(=O)S2", "CC1(COP(OC1)Oc2ccccc2)C",
            "c1ccc2c(c1)Nc3ccccc3P2=O", "CCc1no[c+](n1C)c2ccccc2O", "CCCC[Se]1(c2ccccc2C[O+]1C)Br",
            "C(Nc1c(n[c-]([n+](=O)c1N)N)N)O", "c1ccc-2c(c1)CCc3c2c(n(n3)c4ccccn4=O)O"]
    seen = set()
    for smi in smis:
        a, b = proc.handleMol(smi)
        print a, b, a==b
        assert a==b
        for x in a:
            seen.add(x)

    for line in open(r"C:\Tools\openbabel\baoilleach\test\files\aromatics.smi"):
        smi = line.split()[0]
        a, b = proc.handleMol(smi)
        assert a==b
        for x in a:
            seen.add(x)
    print seen

    start = 0
    for idx, line in enumerate(open("tmp_sorted.txt")):
        if idx < start: continue
        smi = line.split()[1]
        if 'c' not in smi and 'n' not in smi: continue
        a, b = proc.handleMol(smi)
        if any(x not in seen for x in a):
            print idx, line.rstrip()
            print a
            print


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
OpenBabel-Devel mailing list
OpenBabel-Devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/openbabel-devel

Reply via email to