/***************************************************************************** *** cpfrags.pl - find the ClogP fragments -- or remaining *** "carbon-fragments" -- in a molecule. *** *** 1. Identify aromatic atoms and bonds as such using int property *** "arom". Set weights of atoms to 9[0-9]. *** *** 2. "De-aromatize" the molecule so the fragments will become *** legit dt_cansmiles-able "molecules". Aromatic bonds become *** single. *** *** 3. Find all isolating carbons (carbons not double or triple bonded *** to hetero atoms); dealloc all these, first adding wildcards *** where they were attached to non-ICs. *** *** 4. Call dt_mod_off and dt_cansmi for this disconnected mol. For *** each disconnected fragment, write SMILES, using perl *** postprocessing script to string substitute for weight-labeled atoms: *** [99C]->c [98N]->n, [98NH]->[nH], etc. *** *** Kekule Option: Instead of aromatic frags, let's write the Kekule *** versions so they'll be legal smiles. Aromatic bonds become *** single or double according to their bond order. ****************************************************************************** *** fiddle.pl - changes [99__] atoms to aromatic. *** *** s/\[9[0-9]C[H0-9]*\]/c/gi; *** s/\[9[0-9]N[H0-9]*\]/n/gi; *** s/\[9[0-9]O[H0-9]*\]/o/gi; *** s/\[9[0-9]S[H0-9]*\]/s/gi; *** s/\[9[0-9]Se[H0-9]*\]/se/gi; *** s/\[9[0-9]P[H0-9]*\]/p/gi; *** *** s/\[9[0-9]C([H0-9]*)([+-]+)\]/[c$1$2]/gi; *** s/\[9[0-9]N([H0-9]*)([+-]+)\]/[n$1$2]/gi; *** s/\[9[0-9]O([H0-9]*)([+-]+)\]/[o$1$2]/gi; *** s/\[9[0-9]S([H0-9]*)([+-]+)\]/[s$1$2]/gi; *** s/\[9[0-9]Se([H0-9]*)([+-]+)\]/[se$1$2]/gi; *** s/\[9[0-9]P([H0-9]*)([+-]+)\]/[p$1$2]/gi; ****************************************************************************** *** fiddle_kek.pl - Kekule fiddle *** *** s/\[99Si/[C/g; *** s/\[98Si/[N/g; *** s/\[97Si/[O/g; *** s/\[96Si/[S/g; *** s/\[95Si/[Se/g; *** s/\[94Si/[P/g; ****************************************************************************** *** Author: Jeremy Yang *** Rev: 1 Feb 2001 *****************************************************************************/ #include #include #include #include "dt_smiles.h" #include "dt_smarts.h" #include "du_utils.h" #define SMARTS_IC "[#6&!$(*=,#[!#6])]" /*** Isolating carbon smarts ***/ #define XOR(a,b) ((a) && !(b) || !(a) && (b)) dt_Handle stream2seq(dt_Handle stream); int sameword(char *p,char *q) {while (*p||*q) {if (tolower(*p++)!=tolower(*q++)) return 0;} return 1;} void help(void) { fprintf(stderr, "\t************************************************************\n" "\t| cpfrags - find the ClogP fragments in a molecule |\n" "\t************************************************************\n" "\t| |\n" "\t| cpfrags [opts] [out.smi] |\n" "\t| |\n" "\t| opts: |\n" "\t| -carbonfrags ... isolating carbon frags instead |\n" "\t| -kekule ... write kekule frags |\n" "\t| -help ... this help |\n" "\t| |\n" "\t|OUTPUT MUST BE POSTPROCESSED BY fiddle.pl or fiddle_kek.pl|\n" "\t************************************************************\n" "\t| Daylight CIS Inc. |\n" "\t| Toolkit Contrib Program |\n" "\t************************************************************\n"); exit(1); } char *PROG; main(int argc, char *argv[]) { dt_Handle mol,atoms,bonds,atom,bond,pathset,pattern,seq,sob,mols; dt_Handle pat_ic,pathset_ic; dt_Handle xatom,xbond,bord,atomseq,bondseq,newatom; char *p,*p2,*p3,*q; /*** generic char ptrs ***/ int i,j,k; /*** generic indices ***/ int ok,an; char *smi; /*** smiles string ***/ int len,count,fragcount; char buff[5000]; /*** all-purpose char buffer ***/ int cmode=0,kekule=0; int hcnt=0,oldhcnt=0;/*DEBUG*/ /************************************** *** Parse command-line **************************************/ for (PROG=*argv++; --argc; ++argv) { if (sameword(*argv, "-help")) { help(); } else if (sameword(*argv,"-carbonfrags")) { cmode = 1; fprintf(stderr,"Isolating-carbon frags mode...\n"); } else if (sameword(*argv,"-kekule")) { kekule = 1; fprintf(stderr,"Kekule mode...\n"); } else { fprintf(stderr,"ERROR: Bad option \"%s\"\n",*argv); help(); } } pat_ic = dt_smartin(strlen(SMARTS_IC),SMARTS_IC); if (NULL_OB==pat_ic) { fprintf(stderr,"ERROR(%s): dt_smartin error.\n",PROG); goto DONE; } for (count=0;NULL!=(smi=du_fgetline(&len,stdin));++count) { if (NULL_OB==(mol=dt_smilin(strlen(smi),smi))) goto NEXTSMI; atoms = dt_stream(mol,TYP_ATOM); bonds = dt_stream(mol,TYP_BOND); /******************************************** *** Match and label isolating carbons ********************************************/ pathset_ic = dt_match(pat_ic,mol,0); if (NULL_OB==pathset_ic) { if (cmode) goto NEXTSMI; else goto DONEFRAGGING; } while (NULL_OB!=(atom=dt_next(atoms))) { ok=dt_member(pathset_ic,atom); dt_setboolean(atom,2,"ic",ok); } /******************************************************* *** Tag each aromatic atom/bond. (After dt_mod_on, there *** is no aromaticity detection.) *** Mod-on so we can cut it up. *******************************************************/ for (dt_reset(atoms);NULL_OB!=(atom=dt_next(atoms));) { dt_setboolean(atom,4,"arom",dt_aromatic(atom)); } for (dt_reset(bonds);NULL_OB!=(bond=dt_next(bonds));) { dt_setboolean(bond,4,"arom",dt_aromatic(bond)); } dt_mod_on(mol); /******************************************************* *** De-aromatize each aromatic atom. Set weight=99. *** If not in Kekule mode set *** each aromatic bond single. If Kekule mode set *** bond type to bond order. Note that *** by setting bond-type, bond-order will follow; *** bond-order is derived at dt_mod_off. *******************************************************/ dt_reset(atoms); while (NULL_OB!=(atom=dt_next(atoms))) { if (dt_boolean(atom,4,"arom")) { dt_setaromatic(atom,0); switch(an=dt_number(atom)) { case (DX_ATN_C): dt_setweight(atom,99); break; case (DX_ATN_N): dt_setweight(atom,98); break; case (DX_ATN_O): dt_setweight(atom,97); break; case (DX_ATN_S): dt_setweight(atom,96); break; case (DX_ATN_Se): dt_setweight(atom,95); break; case (DX_ATN_P): dt_setweight(atom,94); break; default: fprintf(stderr,"ERROR(%s): Unknown aromatic atnum=%d\n",PROG,an); } if (kekule) dt_setnumber(atom,DX_ATN_Si); if (!kekule) dt_setimp_hcount(atom,0); } } dt_reset(bonds); while (NULL_OB!=(bond=dt_next(bonds))) { if (dt_boolean(bond,4,"arom")) { if (kekule) dt_setbondtype(bond,dt_bondorder(bond)); else dt_setbondtype(bond,DX_BTY_SINGLE); } } dt_dealloc(bonds); /******************************************************* *** Dealloc ICs, but first add wildcards where they *** were attached to non-ICs. ******************************************************** *** If carbonfrags mode, dealloc non-ICs -- reverse *** logic. *******************************************************/ atomseq = stream2seq(atoms); dt_dealloc(atoms); while (NULL_OB!=(atom=dt_next(atomseq))) { if (!XOR(cmode,dt_boolean(atom,2,"ic"))) { continue; } bondseq = stream2seq(bonds=dt_stream(atom,TYP_BOND)); dt_dealloc(bonds); while (NULL_OB!=(bond=dt_next(bondseq))) { xatom = dt_xatom(atom,bond); if (XOR(cmode,!dt_boolean(xatom,2,"ic"))) { newatom = dt_addatom(mol,DX_ATN_WILD,0); bord = kekule ? dt_bondorder(bond) : DX_BTY_SINGLE; xbond = dt_addbond(xatom,newatom,bord); } } dt_dealloc(bondseq); dt_dealloc(atom); } dt_dealloc(atomseq); if (!dt_mod_off(mol)) { fprintf(stderr,"ERROR(%s): dt_mod_off failed for \"%s\".\n",PROG,smi); mol=NULL_OB; goto NEXTSMI; } DONEFRAGGING: smi = dt_cansmiles(&len,mol,1); if (len<=0) goto NEXTSMI; for (p=smi;;p=q+1) { ++fragcount; q=strchr(p,'.'); if (q) { printf("%.*s\n",q-p,p); } else { printf("%s\n",p); break; } } NEXTSMI: if (NULL_OB!=mol) dt_dealloc(mol); if (DX_ERR_ERROR<=dt_errorworst()) { fprintf(stderr,"DEBUG: smi=\"%s\"\n",smi); } du_printerrors(stderr,DX_ERR_ERROR); dt_errorclear(); if ((hcnt=dt_vh_count())!=oldhcnt) /***DEBUG***/ { fprintf(stderr,"DEBUG: Handles in use = %d\n",dt_vh_count()); oldhcnt=hcnt; } } DONE: du_printerrors(stderr,DX_ERR_ERROR); dt_errorclear(); fprintf(stderr,"Results (%s): %d SMILES read.\n",PROG,count); fprintf(stderr,"Results (%s): %d fragments (non-unique) found.\n",PROG, fragcount); exit(0); } /******************************************************************** *** stream2seq - Copy a stream to a new sequence. ********************************************************************/ dt_Handle stream2seq(dt_Handle stream) { dt_Handle seq=dt_alloc_seq(),ob; dt_reset(stream); while (NULL_OB!=(ob=dt_next(stream))) { dt_append(seq,ob); } return seq; }