/***************************************************************************** *** gadd.c - GA-based Druglike Database evolver, using *** fragments and carbon-fragments (e.g., clogp-style) *** as building blocks. ***************************************************************************** *** Author: Jeremy Yang *** Rev: 12 Feb 2001 ***************************************************************************** *** Frag-smiles must have wildcards at attachment points, *** which may be single, double, or triple bonded. E.g., *** *** *O,*O*,*N(*)*,*[N]=*,*Cl,*OC(=O)*,*C(=O)*,*N*,*C(=O)O,*N (FRAGs) *** *** *C,*CC*,*CC,*C*,*[C]1=[C](*)[CH]=[CH][CH]=[CH]1 *** *[C]1=[CH][CH]=[C](*)[CH]=[CH]1,*[CH]=*,*[C](=*)* *** *[C]1=[CH][C](=[C](*)[CH]=[CH]1)*,*[C]1=[CH][CH]=[CH][CH]=[CH]1 (CFRAGs) *** *** We want Kekule smiles, that is, no aromatic *** atoms/bonds, thus no illegal smiles. (When these are canonicalized, *** there may be aromatic rings, which is ok.) ***************************************************************************** *** ALGORITHM: *** *** Create a Thor database. *** Load with an initial population of fragments (FRAGs) *** and carbon-fragments (CFRAGs) smiles. Call this generation zero. *** AND... (NEW) load ringsystem fragments as either *** FRAGs or CFRAGs or both, depending on the presence of IC or non-IC *** attachment points. In general, these are often both (UMOL-like) but *** we want to mate them with each other as well as UMOLs and other *** fragments. *** *** smiles in database may be: *** 1. FRAGs: fragments (odd id *) *** 2. CFRAGs: carbon fragments (even id *) *** 3. UMOLs: unfinished molecules (1+ FRAG, 1+ CFRAG, 1+ attachments) *** 4. FMOLs: finished molecules (1+ FRAG, 1+ CFRAG, 0 attachments) *** 5. RINGs, labelled as FRAGs and/or CFRAGs. *** *** LOOP until a QUIT CONDITION: *** For each smiles in database, attempt one mating with another smiles: *** - FRAGs mate with CFRAGs. *** - CFRAGs mate with FRAGs. *** - UMOLs mate with FRAGs or CFRAGs. *** - FMOLs don't mate. *** Select fragments with probability distribution reflecting the *** composition of the seed database. *** Mating is accomplished with one of three smirks for single, *** double, and triple attachment points. *** Zap any leftover disconnected wildcard atoms. *** Write to database each new, fit SMILES (normally UMOL at this stage). *** Test each UMOL for "loose" fitness. *** Convert UMOLs to FMOLs by replacing remaining *'s with hydrogens. *** Test each FMOL for final "tight" fitness. *** Write to database each new, fit FMOL SMILES. *** Optionally, sample fit SMILES. *** Report new (N[I],SMI[I],UMOL[I]) for database. *** Note: N[I] = number of FMOLs *** QUIT CONDITIONS: *** - population limit reached (N[I]==Nmax) *** - generation limit reached (I==Imax) *** - population converges (N[I]==N[I-1]) *** ENDGAME: *** Delete all UMOLs. *** All final smiles in db should be FRAGs, CFRAGs, or FMOLs. *** *** FITNESS CONDITIONS: *** Mol Weight [200-500] *** HB donors [0-5] *** HB acceptors [0-10] *** Rotatable bonds [0-8] *** Heavy Atoms [15-50] *** ClogP [-10 to +5] *** Charge [-2 to +2] *** Reject matches with -badsmarts file. *** ***************************************************************************** *** To do: *** Crunch db in endgame. *** < > Performance improvement: avoiding repeatedly generating the *** same products --how? *** < > Efficiency improvement: don't waste disk space. Don't store *** frag-smiles for UMOLs, only FMOLs. Maybe frag-smiles should *** be indirect? *** Test the exponential distribution frag choice with data. *** Do the unlikely frags get used enough? At all? *** AHA! It ain't ok -- switch to inverse (1/x) distribution. *** Nope -- cumulative probability arrays achieve original goal. *** Provide no-frag-weighting option. *** < > Better/more reporting. How many frags/cfrags used? *** < > Verify that double & triple joins are happening. *** Use ringsystems for fragments (actually gen-zero UMOLs). *** Choose biggest piece of disconnected products. *** Fix bug: when the new bond is intra-UMOL, the new frag is added *** to the fraglist when it shouldn't be. Possible way: *** Separate intra-frag bonding from inter-frag bonding *** steps (w/ different smirks, 0-level parens). *** Maybe no intra-frag bonding at all (done). ***************************************************************************** *** To consider: *** < > Fitsmarts: optional file of required/desireable smarts. *** < > 3D fitness: rubicon and pharmacaphore-ish stuff. *** < > Quantify/store fitness score; periodically cull the unfittest. *** < > Additional smirks to mutate mols (goforth-like). *** < > Increase likelihood of ring formation. *** Badsmarts: filter large rings *** < > Degrees of fitness, probabilistic selection. *** < > Add "RING" status (e.g., STATUS). *** < > Unclosed partial-ring [C]FRAGs may not make sense (e.g., *CCCCC*). *****************************************************************************/ #include #include #include #include #include #include "dt_smiles.h" #include "dt_smarts.h" #include "dt_thor.h" #include "dt_progob.h" #include "du_utils.h" #define DATATYPESFILE "datatypes.tdt" #define DEFAULT_DBSIZE 5000 #define DEFAULT_NMAX 5000 #define DEFAULT_IMAX 12 #define DEFAULT_MIN_WT 200 #define DEFAULT_MAX_WT 500 #define DEFAULT_MIN_CHARGE -2 #define DEFAULT_MAX_CHARGE 2 #define DEFAULT_MIN_RB 0 #define DEFAULT_MAX_RB 8 #define DEFAULT_MIN_HBD 0 #define DEFAULT_MAX_HBD 5 #define DEFAULT_MIN_HBA 0 #define DEFAULT_MAX_HBA 10 #define DEFAULT_MIN_HEAVY 11 #define DEFAULT_MAX_HEAVY 50 #define DEFAULT_MIN_CP -10.0 #define DEFAULT_MAX_CP 5.0 #define SMARTS_HBD "[!#6;!H0]" #define SMARTS_HBA "[$([!#6;+0]);!$([F,Cl,Br,I]);!$([o,s,nX3]);!$([Nv5,Pv5,Sv4,Sv6])]" #define SMARTS_RB "[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]" #define SMIRKS_ADD_H "[*:1][#0:2]>>[*:1][H].[#0:2]" /*** Inter-frag bond-forming smirks (one frag to another) ***/ #define SMIRKS_JOIN1 "([*;#6;!$([#6]=,#[!#6&!#0]):1]-[#0:3]).([*;!#6,$([#6]=,#[!#6&!#0]):2]-[#0:4])>>[*:1]-[*:2].[*:3].[*:4]" #define SMIRKS_JOIN2 "([*;#6;!$([#6]=,#[!#6&!#0]):1]=[#0:3]).([*;!#6,$([#6]=,#[!#6&!#0]):2]=[#0:4])>>[*:1]=[*:2].[*:3].[*:4]" #define SMIRKS_JOIN3 "([*;#6;!$([#6]=,#[!#6&!#0]):1]#[#0:3]).([*;!#6,$([#6]=,#[!#6&!#0]):2]#[#0:4])>>[*:1]#[*:2].[*:3].[*:4]" #define DX_PT_ERTEXT "Qwerty: Say ERRORTEXT." #define DX_PT_FRAGDB "Qwerty: Say FRAGDB." #define DX_PT_TABLE "Qwerty: Set TABLEOUTPUT." #define DX_PT_NOTABLE "Qwerty: Set NOTABLEOUTPUT." #define DX_PT_INTERR "Qwerty: Set INTEGERERRORS." #define DX_PT_ENGERR "Qwerty: Set ENGLISHERRORS." #define GENMAX 102 /*** Max allowed Imax is GENMAX-2 ***/ #define PCT(a,b) 100.0 * (float) a / (float) b #define MAX(a,b) ((a) > (b) ? (a) : (b)) int initialize(void); int create_db(char *message); int open_db(void); int load_datatypes(dt_Handle dtdb); int load_fragments(void); int fitness(char *smi,dt_Handle mol,char *level); int rand_id(int mode,int nofavorites,int sum[]); float mwt(dt_Handle mol); int molcharge(dt_Handle mol); int matchpat(dt_Handle mol,dt_Handle pattern); dt_Handle getprogram(char *path,dt_Handle *name,dt_Handle *notice); void fprintsos(FILE *f,dt_Handle sos); void report(char *type); int endgame(void); int finish(char *tdtstr); void zapseq(dt_Handle seq); char *bigpart(char *smi); int sameword(char *p,char *q) {while (*p||*q) {if (tolower(*p++)!=tolower(*q++)) return 0;} return 1;} /************************* GLOBAL VARIABLES *************************/ char *PROG; /*** name of this program ***/ FILE *ffrags=NULL; /*** input frags file ptr ***/ FILE *fcfrags=NULL; /*** input cfrags file ptr ***/ FILE *fseed=NULL; /*** input UMOLs file ptr ***/ char *fragsfile=NULL; /*** input frags file ***/ char *cfragsfile=NULL; /*** input cfrags file ***/ char *seedfile=NULL; /*** input UMOLs file ***/ int dbsize=DEFAULT_DBSIZE; /*** database size ***/ int dofitness=1; /*** apply fitness tests ***/ int quiet=0; /*** quiet progress reports ***/ int N[GENMAX]; /*** population size/generation ***/ int M[GENMAX]; /*** products/generation ***/ int Mfit[GENMAX]; /*** fit products/generation ***/ int Mok[GENMAX]; /*** fit/new/ok products/generation ***/ int UMOL[GENMAX]; /*** UMOLs/generation ***/ int SMI[GENMAX]; /*** UMOLs/generation ***/ int I=0; /*** generation ***/ int pcount=0; /*** total product count ***/ int fitcount=0; /*** total fit product count ***/ int smi_ok=0; /*** total fit/new/ok product count ***/ int errorcount=0; /*** total error count ***/ int min_wt=DEFAULT_MIN_WT; int max_wt=DEFAULT_MAX_WT; int min_charge=DEFAULT_MIN_CHARGE; int max_charge=DEFAULT_MAX_CHARGE; int min_rb=DEFAULT_MIN_RB; int max_rb=DEFAULT_MAX_RB; int min_hbd=DEFAULT_MIN_HBD; int max_hbd=DEFAULT_MAX_HBD; int min_hba=DEFAULT_MIN_HBA; int max_hba=DEFAULT_MAX_HBA; int min_heavy=DEFAULT_MIN_HEAVY; int max_heavy=DEFAULT_MAX_HEAVY; float min_cp=DEFAULT_MIN_CP; float max_cp=DEFAULT_MAX_CP; char *badsmarts=NULL; /*** bad smarts filename ***/ dt_Handle *badpat=NULL; /*** array of bad patterns ***/ int nbadsmarts=0; /*** # of bad smarts/patterns ***/ char *dbspec=NULL; char *tdtstr=NULL; /*** tdt string (malloc-ed) ***/ dt_Handle server; /*** Thor server ***/ dt_Handle db; /*** database ***/ dt_Handle dtdb; /*** datatypes database ***/ dt_Handle smitype; /*** smiles datatype ***/ dt_Handle idtype; /*** $ID datatype ***/ dt_Handle pat_rb; /*** rotatable bond pattern ***/ dt_Handle pat_hbd; /*** h-bond donor pattern ***/ dt_Handle pat_hba; /*** h-bond acceptor pattern ***/ dt_Handle xfrm_join1; /*** reaction transform 1 (single) ***/ dt_Handle xfrm_join2; /*** reaction transform 2 (double) ***/ dt_Handle xfrm_join3; /*** reaction transform 3 (triple) ***/ dt_Handle xfrm_add_h; /*** reaction transform (U->FMOL) ***/ dt_Handle xfrm; /*** reaction transform ***/ int fragcount=0,cfragcount=0; int maxfragid,maxcfragid; int *sumf=NULL,*sumc=NULL; /*** cumulative probability arrays ***/ int nofavorites=0; int test_q_ok=0,test_n_ok=0,test_wt_ok=0,test_rb_ok=0,test_hbd_ok=0, test_hba_ok=0,test_bad_ok=0,test_cp_ok=0; int test_q=0,test_n=0,test_wt=0,test_rb=0,test_hbd=0, test_hba=0,test_bad=0,test_cp=0; void help() { fprintf(stderr, "\t************************************************************\n" "\t| gadd - GA-based Druglike Database evolver using |\n" "\t| clogp-type frags/cfrags. |\n" "\t************************************************************\n" "\t| |\n" "\t| gadd [opts] -frags -cfrags |\n" "\t| |\n" "\t| -frags ... input frags smiles file (reqd) |\n" "\t| -cfrags ... input cfrags smiles file (reqd) |\n" "\t| opts: |\n" "\t| -db ... Thor db to create |\n" "\t| -dbsize ... Size of db to create |\n" "\t| -nmax ... Maximum population to achieve |\n" "\t| -genmax ... Maximum number of generations |\n" "\t| -sample ... Select only every jth hit |\n" "\t| -nofraglist ... Don't record frags used |\n" "\t| -message ... Description of experiment |\n" "\t| -seed ... initial file of UMOLs |\n" "\t| -noendgame ... don't cleanup (leave UMOLs) |\n" "\t| -endgameonly ... just cleanup existing gadd db |\n" "\t| -nofavorites ... don't weight fragment selection |\n" "\t| -quiet ... minimal progress reports |\n" "\t| -help ... this help |\n" "\t| |\n" "\t| fitness opts: |\n" "\t| -min_wt MINWT ... min molweight [200] |\n" "\t| -max_wt MAXWT ... max molweight [500] |\n" "\t| -min_charge MINQ ... min total formal charge [-2] |\n" "\t| -max_charge MAXQ ... max total formal charge [+2] |\n" "\t| -min_rb MINRB ... min rotatable bond count [0] |\n" "\t| -max_rb MAXRB ... max rotatable bond count [8] |\n" "\t| -min_hbd MINHBD .... min h-bond donor sites [0] |\n" "\t| -max_hbd MAXHBD .... max h-bond donor sites [5] |\n" "\t| -min_hba MINHBA .... min h-bond acceptor sites [0] |\n" "\t| -max_hba MAXHBA .... max h-bond acceptor sites [10] |\n" "\t| -min_cp MINCP .... min clogp value [-10] |\n" "\t| -max_cp MAXCP .... max clogp value [+5] |\n" "\t| -min_heavy N .... min heavy atoms [15] |\n" "\t| -max_heavy N .... max heavy atoms [50] |\n" "\t| |\n" "\t| -nofitness ... Ignore fitness considerations |\n" "\t| -badsmarts ... Reject matches with SMARTS |\n" "\t| |\n" "\t************************************************************\n" "\t| Daylight CIS Inc. |\n" "\t| Toolkit Contributed Program |\n" "\t************************************************************\n"); exit(1); } main(int argc, char *argv[]) { dt_Handle mol,pathset,pattern,tdt_h,tdt_h2,seq; dt_Handle reacseq,reac,mols,pmol,tdt_stream,sob; int Nmax=DEFAULT_NMAX; /*** maximum population ***/ int Imax=DEFAULT_IMAX; /*** generation limit ***/ char *smiles; /*** smiles string ***/ int ok; int sample=0; /*** sample? (0->no, n->1/n) ***/ int i,j,k; /*** generic indices ***/ float cp; /*** clogp ***/ char *p,*p2,*p3,*q; /*** generic char ptrs ***/ char *str; /*** generic string ***/ int store_fraglist=1; char *smi1,*smi2,*cansmi,*tdt; char *message=NULL; char *fraglist,*fragids,*fragsmis; char *status,*prodstatus; int tdtstrsize=0; int id,id1,id2; int len; int gen1; int mode,max; int errcode; int isnew; int hcnt=0,oldhcnt=0;/*DEBUG*/ char buff[5000]; /*** all-purpose char buffer ***/ int noendgame=0; int endgameonly=0; time_t t0,t1,tfinal; int h,m; double t; tdtstrsize=1000; tdtstr = (char *)malloc(tdtstrsize*sizeof(char)); /************************************** *** Parse command-line **************************************/ for (PROG=*argv++; --argc; ++argv) { if (sameword(*argv, "-help")) help(); else if (sameword(*argv,"-frags")) { fragsfile = *++argv; --argc; } else if (sameword(*argv,"-cfrags")) { cfragsfile = *++argv; --argc; } else if (sameword(*argv,"-seed")) { seedfile = *++argv; --argc; } else if (sameword(*argv,"-db")) { dbspec = *++argv; --argc; } else if (sameword(*argv,"-dbsize")) { dbsize = atoi(*++argv); --argc; } else if (sameword(*argv,"-nmax")) { Nmax = atoi(*++argv); --argc; } else if (sameword(*argv,"-genmax")) { Imax = atoi(*++argv); --argc; } else if (sameword(*argv,"-sample")) { sample = atoi(*++argv); --argc; } else if (sameword(*argv,"-nofitness")) { dofitness = 0; } else if (sameword(*argv,"-endgameonly")) { endgameonly = 1; } else if (sameword(*argv,"-noendgame")) { noendgame = 1; } else if (sameword(*argv,"-nofraglist")) { store_fraglist = 0; } else if (sameword(*argv,"-nofavorites")) { nofavorites = 1; } else if (sameword(*argv,"-message")) { message = *++argv; --argc; } else if (sameword(*argv,"-quiet")) { quiet = 1; } else if (sameword(*argv,"-badsmarts")) { badsmarts = *++argv; --argc; } else if (sameword(*argv,"-min_heavy")) { min_heavy = atof(*++argv); --argc; } else if (sameword(*argv,"-max_heavy")) { max_heavy = atof(*++argv); --argc; } else if (sameword(*argv,"-min_wt")) { min_wt = atof(*++argv); --argc; } else if (sameword(*argv,"-max_wt")) { max_wt = atof(*++argv); --argc; } else if (sameword(*argv,"-min_charge")) { min_charge = atoi(*++argv); --argc; } else if (sameword(*argv,"-max_charge")) { max_charge = atoi(*++argv); --argc; } else if (sameword(*argv,"-min_rb")) { min_rb = atoi(*++argv); --argc; } else if (sameword(*argv,"-max_rb")) { max_rb = atoi(*++argv); --argc; } else if (sameword(*argv,"-min_hbd")) { min_hbd = atoi(*++argv); --argc; } else if (sameword(*argv,"-max_hbd")) { max_hbd = atoi(*++argv); --argc; } else if (sameword(*argv,"-min_hba")) { min_hba = atoi(*++argv); --argc; } else if (sameword(*argv,"-max_hba")) { max_hba = atoi(*++argv); --argc; } else if (sameword(*argv,"-min_cp")) { min_cp = atof(*++argv); --argc; } else if (sameword(*argv,"-max_cp")) { max_cp = atof(*++argv); --argc; } else { fprintf(stderr,"ERROR: Bad option \"%s\"\n",*argv); help(); } } time(&t0); fputs(ctime(&t0),stdout); if (message) printf("message: %s\n",message); printf("NOTE: max N: %d\n",Nmax); printf("NOTE: max generations: %d\n",Imax); printf("FITNESS CONSTRAINTS:\n"); printf("\tmolweight: %d to %d\n",min_wt,max_wt); printf("\tcharge: %d to %d\n",min_charge,max_charge); printf("\trotatable bonds: %d to %d\n",min_rb,max_rb); printf("\tH-bond donors: %d to %d\n",min_hbd,max_hbd); printf("\tH-bond acceptors: %d to %d\n",min_hba,max_hba); printf("\theavy atoms: %d to %d\n",min_heavy,max_heavy); printf("\tclogp: %.2f to %.2f\n",min_cp,max_cp); if (!dbspec) { dbspec = "gadd"; printf("NOTE(%s): No db name specified; using \"%s\".\n",PROG,dbspec); } if (endgameonly) { if (!open_db()) exit(1); goto BYEBYE; } if (Imax>GENMAX-2) { printf("ERROR(%s): GENMAX exceeded (%d>%d)...\n",PROG,Imax,GENMAX-2); help(); } if (!fragsfile || !cfragsfile) { printf("ERROR(%s): FRAGs & CFRAGs files must be specified...\n",PROG); help(); } if (!initialize()) exit(1); if (!create_db(message)) exit(1); /*** creates db & dtdb ***/ if (!load_datatypes(dtdb)) exit(1); /*** from DATATYPESFILE ***/ if (!load_fragments()) exit(1); /*** [C]FRAGs & seed UMOLs (GEN=0) ***/ I=1; /*** Generation 1: The first products. ***/ N[1]=M[1]=Mfit[1]=Mok[1]=0; SMI[1]=SMI[0]=fragcount+cfragcount+UMOL[1]; MAINLOOP: /** Main loop **/ /***************************************************************** *** FOR EACH GENERATION: *** For each smiles in db, attempt one mating: *** - FRAGs mate with CFRAGs. *** - CFRAGs mate with FRAGs. *** - UMOLs (unfinished) mate with FRAGs or CFRAGs. *** - FMOLs (finished) don't mate. *** For resulting products: *** - New UMOLs are stored. *** - New UMOLs are converted to FMOLs and stored if fit. *****************************************************************/ time(&t1); printf("GENERATION %d (%.*s)...\n",I,24,ctime(&t1)); tdt_stream = dt_stream(db,TYP_DATATREE); for (i=0;NULL_OB!=(tdt_h=dt_next(tdt_stream));++i) { reacseq=mol=NULL_OB; status=fraglist=smi2=NULL; id1=-1; gen1=0; p = dt_thor_tdt2str(&len,tdt_h,0); tdt = du_strndup(p,len); dt_dealloc(tdt_h); /*** Realloc tdtstr to twice the incoming tdt size (kludgy). ***/ if (tdtstrsize<(2*len)) { tdtstrsize*=2; tdtstr = (char *)realloc(tdtstr,tdtstrsize*sizeof(char)); } if (NULL!=(p=strstr(tdt,"STATUS<"))) { q=strchr(p,'>'); status = du_strndup(p+7,(q-p-7)); } if (NULL!=(p=strstr(tdt,"FRAGLIST<"))) { q=strchr(p,'>'); fraglist = du_strndup(p+9,(q-p-9)); fragids=fraglist; p = strchr(fraglist,';'); *p='\0'; fragsmis=p+1; } if (NULL!=(p=strstr(tdt,"$ID<"))) { q=strchr(p,'>'); p2 = du_strndup(p+4,(q-p-4)); id1 = atoi(p2); free(p2); } smi1=tdt+5; p = strchr(smi1,'>'); *p='\0'; if (!status) { printf("ERROR(%s): no status for smi=%s.\n",PROG,status,smi1); goto NEXTTDT;; } else if (sameword(status,"FRAG")) { id2 = rand_id(0,nofavorites,sumc); gen1 = 1; } else if (sameword(status,"CFRAG")) { id2 = rand_id(1,nofavorites,sumf); gen1 = 1; } else if (sameword(status,"UMOL")) { mode = (rand() < RAND_MAX/2) ? 1 : 0; id2 = rand_id(mode,nofavorites,mode?sumf:sumc); } else if (sameword(status,"FMOL")) { goto NEXTTDT;; } else { printf("ERROR(%s): bad status=%s for smi=%s.\n",PROG,status,smi1); goto NEXTTDT;; } sprintf(buff,"%d",id2); seq = dt_thor_xrefget(db,idtype,strlen(buff),buff,&errcode); if (NULL_OB==seq) { printf("ERROR(%s): Can't find frag for id=%d.\n",PROG,id2); ++errorcount; goto NEXTTDT; } if (2STATUS<%s>",cansmi,prodstatus); if (store_fraglist) { sprintf(buff,"%d",id1); sprintf(tdtstr+k,"FRAGLIST<%s,%d;%s.%s>|", (gen1?buff:fragids),id2,(gen1?smi1:fragsmis),smi2); } else { sprintf(tdtstr+k,"|"); } tdt_h2 = dt_thor_str2tdt(db,strlen(tdtstr),tdtstr,1); j = dt_thor_tdtput(tdt_h2,DX_THOR_OVERWRITE); if (j>0) { ++smi_ok; ++Mok[I]; ++SMI[I]; if (sameword(prodstatus,"UMOL")) ++UMOL[I]; else if (sameword(prodstatus,"FMOL")) ++N[I]; if (N[I]>=Nmax) { goto DONE; } } /********************************************************** *** Finish UMOL to FMOL. **********************************************************/ if (sameword(prodstatus,"UMOL")) { finish(tdtstr); } NEXTPRODUCT: if (cansmi) free(cansmi); if (NULL_OB!=tdt_h2) dt_dealloc(tdt_h2); } } NEXTTDT: if (NULL_OB!=reacseq) zapseq(reacseq); if (NULL_OB!=mol) dt_dealloc(mol); if (status) free(status); if (fraglist) free(fraglist); if (tdt) free(tdt); if (smi2) free(smi2); /******************DEBUG****************************/ if (DX_ERR_ERROR<=dt_errorworst()){printf("DEBUG: NEXTTDT {\n"); du_printerrors(stdout,DX_ERR_ERROR);dt_errorclear();puts("\t}\n");/*DEBUG*/} if ((hcnt=dt_vh_count())!=oldhcnt) { printf("DEBUG: Handles in use = %d\n",dt_vh_count()); oldhcnt=hcnt; } /******************DEBUG****************************/ du_printerrors(stderr,DX_ERR_ERROR); dt_errorclear(); if (!quiet && !(i%1000)) {fprintf(stderr,"%d..",i);fflush(stderr);} /***progress***/ if (!dt_ping(server,7,"whassup")) { /*** Connection ok? ***/ fprintf(stderr,"ERROR(%s): Lost server; exiting.\n",PROG); exit(1); } } DONE: dt_dealloc(tdt_stream); report("normal"); N[I+1] = N[I]; UMOL[I+1] = UMOL[I]; SMI[I+1] = SMI[I]; if (I>=Imax) { printf("%s exit condition: generation %d reached.\n",PROG,Imax); goto BYEBYE; } else if (N[I]>=Nmax) { printf("%s exit condition: population limit %d reached.\n",PROG,Nmax); goto BYEBYE; } ++I; goto MAINLOOP; BYEBYE: ++I; printf("Count of TDTs in database: %d\n",dt_count(db,TYP_DATATREE)); if (noendgame) { printf("NOTE(%s): Skipping endgame; UMOLs remain in db.\n",PROG); } else { endgame(); } report("final"); printf("Final count of TDTs in %s: %d\n",dbspec,dt_count(db,TYP_DATATREE)); fprintf(stderr,"thorcrunch-ing %s...\n",dbspec); dt_thor_crunchdata(db); dt_dealloc(db); dt_dealloc(server); du_printerrors(stderr,DX_ERR_ERROR); if (sumf) free(sumf); if (sumc) free(sumc); t = difftime(tfinal=time(&tfinal),t0); h = (int)(t/3600.0); t -= 3600.0 * (double)h; m = (int)(t/60.0); t -= 60.0 * (double)m; printf("Total time elapsed: %d:%d:%d\n",h,m,(int)t); printf("%s errors: %d\n",PROG,errorcount); printf("%s is done. Adios!\n",PROG); exit(0); } /***************************************************************************** *** finish - convert UMOL to FMOL. *****************************************************************************/ int finish(char *tdt) { dt_Handle tdt_stream=NULL_OB; dt_Handle tdt_h1=NULL_OB; dt_Handle tdt_h2=NULL_OB; dt_Handle reacseq=NULL_OB; dt_Handle reac=NULL_OB; dt_Handle mol=NULL_OB; dt_Handle pmol=NULL_OB; dt_Handle mols=NULL_OB; char *p,*q,*smi1,*fraglist,*smi2; int len,i=0,ok,isnew; char *cansmi=NULL; char *tdtstr2=NULL; tdtstr2=du_strdup(tdt); smi1 = strstr(tdtstr2,"$SMI<") + 5; q = strchr(smi1,'>'); *q = '\0'; fraglist = strstr(q+1,"FRAGLIST<"); if (fraglist) { q=strchr(fraglist+=9,'>'); *q='\0'; } mol = dt_smilin(strlen(smi1),smi1); if (NULL_OB==mol) goto DONE; /****************************************************** *** Transform exhaustively the mol, producing a sequence *** of one reaction. ******************************************************/ reacseq = dt_xtransform(xfrm_add_h,mol,DX_FORWARD,0); if (NULL_OB==reacseq) goto DONE; if (11 reac from dt_xtransform");} reac=dt_next(reacseq); if (NULL_OB==reac) goto DONE; /****************************************************** *** Find the products. This transform should only produce *** one product from one reactant in one reaction. ******************************************************/ mols = dt_stream(reac,TYP_MOLECULE); if (NULL_OB==mols) goto DONE; do { pmol=dt_next(mols); } while (NULL_OB!=pmol && DX_ROLE_PRODUCT!=dt_getrole(pmol,reac)); p = dt_cansmiles(&len,pmol,1); if (NULL_OB==pmol || len<=0 || !p) goto DONE; cansmi = du_strndup(p,len); /****************************************************** *** Delete any disconnected wildcard atoms (leading or trailing). *** If any wildcards remaining, discard. ******************************************************/ smi2=cansmi; while (0==strncmp(smi2,"*.",2)) { smi2+=2; } len = strlen(smi2); while (0==strcmp(smi2+len-2,".*")) { smi2[len-=2] = '\0'; } if (strchr(smi2,'*')) goto DONE; /****************************************************** *** Check that SMILES not yet in DB. ******************************************************/ tdt_h2 = dt_thor_tdtget(db,smitype,strlen(smi2),smi2,0,&isnew); if (NULL_OB!=tdt_h2) goto DONE; /****************************************************** *** Check fitness (tight). ******************************************************/ ok=1; if (dofitness) { if (ok=fitness(smi2,pmol,"normal")) { ++fitcount; ++Mfit[I]; } } if (!ok) goto DONE; /****************************************************** *** Form TDT string. ******************************************************/ sprintf(tdtstr,"$SMI<%s>STATUSFRAGLIST<%s>|",smi2, fraglist?fraglist:"n/a"); tdt_h2 = dt_thor_str2tdt(db,strlen(tdtstr),tdtstr,1); if (0 < dt_thor_tdtput(tdt_h2,DX_THOR_OVERWRITE)) { ++smi_ok; ++Mok[I]; ++N[I]; ++SMI[I]; } else { fprintf(stderr,"ERROR: Failed to load finish() tdt.\n"); ++errorcount; } DONE: if (tdt_h1!=NULL_OB) dt_dealloc(tdt_h1); if (mol!=NULL_OB) dt_dealloc(mol); if (reacseq!=NULL_OB) zapseq(reacseq); if (tdt_h2!=NULL_OB) dt_dealloc(tdt_h2); if (cansmi) free(cansmi); if (tdtstr2) free(tdtstr2); return(1); } /***************************************************************************** *** endgame - Clean up database by deleting UMOLs. All UMOLs should *** already have been converted to FMOLs in main(). *****************************************************************************/ int endgame(void) { dt_Handle tdt_stream,tdt_h1; char *tdt,*p; int len,i=0; time_t t1; time(&t1); printf("ENDGAME/%d (%.*s)...\n",I,24,ctime(&t1)); tdt_stream = dt_stream(db,TYP_DATATREE); for (i=0;NULL_OB!=(tdt_h1=dt_next(tdt_stream));++i) { tdt=NULL; p = dt_thor_tdt2str(&len,tdt_h1,0); tdt = du_strndup(p,len); if (NULL==strstr(tdt,"STATUS")) goto NEXTTDT; /****************************************************** *** Delete UMOL TDT. ******************************************************/ if (0max_charge) return 0; test_q_ok++; /********************************************************** *** HEAVYATOM COUNT TEST **********************************************************/ test_n++; n = dt_count(mol,TYP_ATOM); if (n<(loosefit?0:min_heavy) || n>max_heavy) return 0; test_n_ok++; /********************************************************** *** MOLWEIGHT TEST **********************************************************/ test_wt++; wt = mwt(mol); if (wt<(loosefit?0.0:min_wt) || wt>max_wt) return 0; test_wt_ok++; /********************************************************** *** ROTATABLE BONDS TEST **********************************************************/ test_rb++; rb = matchpat(mol,pat_rb); if (rb<(loosefit?0:min_rb) || rb>max_rb) return 0; test_rb_ok++; /********************************************************** *** H-BOND DONOR TEST **********************************************************/ test_hbd++; hbd = matchpat(mol,pat_hbd); if (hbd<(loosefit?0:min_hbd) || hbd>max_hbd) return 0; test_hbd_ok++; /********************************************************** *** H-BOND ACCEPTOR TEST **********************************************************/ test_hba++; hba = matchpat(mol,pat_hba); if (hba<(loosefit?0:min_hba) || hba>max_hba) return 0; test_hba_ok++; /********************************************************** *** BAD-SMARTS TEST **********************************************************/ for (i=0;i=60 || clogpmax_cp) { return(0); } } test_cp_ok++; return(1); /*** PASSED ALL FITNESS TESTS. ***/ } /***************************************************************************** *** matchpat - returns the number of unique-set-of-atoms pattern matches *** (like MATCH but with SMARTS already interpreted.) *****************************************************************************/ int matchpat(dt_Handle mol,dt_Handle pattern) { dt_Handle paths; int j; if (NULL_OB==pattern) return 0; paths = dt_umatch(pattern,mol,0); j = dt_count(paths,TYP_PATH); dt_dealloc(paths); return ((j<0) ? 0 : j); } /***************************************************************************** *** initialize - Load/define SMIRKS, SMARTS. *****************************************************************************/ int initialize(void) { FILE *fbadsmarts; char *smarts,*p; int len; int badpatsize=10; srand((unsigned)time(NULL)); /*** randomize for rand_id() ***/ badpat = (dt_Handle *)malloc(badpatsize*sizeof(dt_Handle)); pat_hbd=dt_smartin(strlen(SMARTS_HBD),SMARTS_HBD); pat_hba=dt_smartin(strlen(SMARTS_HBA),SMARTS_HBA); pat_rb=dt_smartin(strlen(SMARTS_RB),SMARTS_RB); if (pat_hbd==NULL_OB||pat_hba==NULL_OB||pat_rb==NULL_OB) { printf("ERROR (%s): dt_smartin failure.\n",PROG); return (0); } xfrm_join1=dt_smirkin(strlen(SMIRKS_JOIN1),SMIRKS_JOIN1); xfrm_join2=dt_smirkin(strlen(SMIRKS_JOIN2),SMIRKS_JOIN2); xfrm_join3=dt_smirkin(strlen(SMIRKS_JOIN3),SMIRKS_JOIN3); xfrm_add_h=dt_smirkin(strlen(SMIRKS_ADD_H),SMIRKS_ADD_H); if (xfrm_join1==NULL_OB||xfrm_join2==NULL_OB||xfrm_join3==NULL_OB|| xfrm_add_h==NULL_OB) { printf("ERROR (%s): dt_smirkin failure.\n",PROG); return (0); } /********************************************************** *** Read and interpret SMARTS from file. **********************************************************/ if (badsmarts) { if (!(fbadsmarts=fopen(badsmarts,"r"))) { fprintf(stderr, "Error opening \"%s\"\n",badsmarts); return(0); } for (nbadsmarts=0;NULL!=(smarts=du_fgetline(&len,fbadsmarts));) { if (NULL!=(p=strchr(smarts,' '))) *p='\0'; if (badpatsize<(nbadsmarts+1)) { badpatsize*=2; badpat=(dt_Handle *)realloc(badpat,badpatsize*sizeof(dt_Handle)); } if (NULL_OB==(badpat[nbadsmarts]=dt_smartin(strlen(smarts),smarts))) { printf("ERROR (%s): Invalid SMARTS: \"%s\"\n",PROG,smarts); continue; } ++nbadsmarts; } printf("NOTE (%s): %d SMARTS read...\n",PROG,nbadsmarts); } /********************************************************** *** Open frags and cfrags files. **********************************************************/ if (!(ffrags=fopen(fragsfile,"r"))) { printf("ERROR(%s): Error opening \"%s\"\n",PROG,fragsfile); return(0); } if (!(fcfrags=fopen(cfragsfile,"r"))) { printf("ERROR(%s): Error opening \"%s\"\n",PROG,cfragsfile); return(0); } if (seedfile && !(fseed=fopen(seedfile,"r"))) { printf("ERROR(%s): Error opening \"%s\"\n",PROG,seedfile); return(0); } return(1); } /***************************************************************************** *** load_fragments - Read and load FRAGs,CFRAGs from files. *** FRAGs/CFRAGs are "generation 0", not part of the population really. *** Also, assign cumulative probability arrays. *****************************************************************************/ int load_fragments(void) { int i,id,len; char *p,*frag; dt_Handle tdt_h; /****************************************************************** *** Read and load FRAGs from file. IDs are odd ints [1,3..]. ******************************************************************/ for (i=0;NULL!=(frag=du_fgetline(&len,ffrags));) { ++i; p=strchr(frag,' '); /** 1st field is smi [2nd is frequency]. **/ if (p) *p='\0'; id = (2*i)-1; sprintf(tdtstr,"$SMI<%s>STATUS$ID<%d>FRAGINFO<%s;%s>|", frag,id,p+1,fragsfile); tdt_h = dt_thor_str2tdt(db,strlen(tdtstr),tdtstr,1); dt_thor_tdtput(tdt_h,DX_THOR_OVERWRITE); dt_dealloc(tdt_h); } maxfragid=id; printf("FRAGs count: %d.\n",fragcount=i); printf("FRAG IDs are odd ints from 1 to %d.\n",id); /************************************************** *** Assign probability-array w/ FRAG-frequencies. **************************************************/ if (!nofavorites) { rewind(ffrags); sumf = (int *)malloc((fragcount+1)*sizeof(int)); sumf[i=0]=0; for (i=0;NULL!=(frag=du_fgetline(&len,ffrags));) { ++i; p=strchr(frag,' '); /** 2nd field is frequency. **/ sumf[i] = sumf[i-1]+atoi(p+1); } sumf[0]=i; /*** sumf[0] is highest index ***/ printf("DEBUG: sumf[0]=%d,sumf[%d]=%d.\n",sumf[0],i,sumf[i]); } else { sumf = (int *)malloc(sizeof(int)); sumf[0] = fragcount; /*** needed by rand_id ***/ } close(ffrags); /****************************************************************** *** Read and load CFRAGs from file. IDs are even ints [0,2..]. ******************************************************************/ for (i=0;NULL!=(frag=du_fgetline(&len,fcfrags));) { ++i; p=strchr(frag,' '); /** 1st field is smi [2nd is frequency]. **/ if (p) *p='\0'; id = 2*(i-1); sprintf(tdtstr,"$SMI<%s>STATUS$ID<%d>FRAGINFO<%s;%s>|", frag,id,p+1,cfragsfile); tdt_h = dt_thor_str2tdt(db,strlen(tdtstr),tdtstr,1); dt_thor_tdtput(tdt_h,DX_THOR_OVERWRITE); dt_dealloc(tdt_h); } maxcfragid=id; printf("CFRAGs count: %d.\n",cfragcount=i); printf("CFRAG IDs are even ints from 0 to %d.\n",id); /************************************************** *** Assign probability-array w/ CFRAG-frequencies. **************************************************/ if (!nofavorites) { rewind(fcfrags); sumc = (int *)malloc((cfragcount+1)*sizeof(int)); sumc[i=0]=0; for (i=0;NULL!=(frag=du_fgetline(&len,fcfrags));) { ++i; p=strchr(frag,' '); /** 2nd field is frequency. **/ sumc[i] = sumc[i-1]+atoi(p+1); } sumc[0]=i; /*** sumc[0] is highest index ***/ printf("DEBUG: sumc[0]=%d,sumc[%d]=%d.\n",sumc[0],i,sumc[i]); } else { sumc = (int *)malloc(sizeof(int)); sumc[0] = cfragcount; /*** needed by rand_id ***/ } close(fcfrags); /********************************************************** *** Read and load seeds (UMOLs) from file. Fragids are *** consecutive ints from previous max, used only for *** FRAGLIST-keeping purposes. **********************************************************/ if (seedfile) { id = MAX(maxfragid,maxcfragid); for (i=0;NULL!=(frag=du_fgetline(&len,fseed));) { ++i; p=strchr(frag,' '); /** 1st field is smi [2nd is frequency]. **/ if (p) *p='\0'; sprintf(tdtstr,"$SMI<%s>STATUSFRAGLIST<%d;%s>|",frag,++id,frag); tdt_h = dt_thor_str2tdt(db,strlen(tdtstr),tdtstr,1); dt_thor_tdtput(tdt_h,DX_THOR_OVERWRITE); dt_dealloc(tdt_h); } close(fcfrags); printf("Initial UMOLs count: %d.\n",UMOL[0]=UMOL[1]=i); } return(1); } /***************************************************************************** *** RAND_ID - return a random ID number *** *** mode: 0 -> even [0...2n] *** 1 -> odd [1..2n+1] *** *** nofavorites: TRUE -> unbiassed random selection *** FALSE -> probability proportional to frag frequency *** *** sum[]: sum[0] == n == # of fragments == possible return values *** if (nofavorites) sum[1..n] == cumulative fragment frequencies *** ****************************************************************************** *** Cumulative Probability Array method: Arrays are derived from the *** fragment frequency data. sum[1] == frequency of frag #1, *** sum[i] == sum[i-1] + frequency(i). One improvement might be *** to use a binary search to locate the correct index, but since *** frag frequencies are pre-ordered with most frequent first, and *** frequencies are typically very skewed, the linear search may be *** fine or even better. *** Thank you RAS! *****************************************************************************/ int rand_id(int mode,int nofavorites,int sum[]) { float r; int i; int n=sum[0]; /*** n = highest index = # frags ***/ r = rand(); /*** 0 to RAND_MAX (maybe 32k) ***/ if (nofavorites) { r = (r/RAND_MAX) * (float) n; /*** 0<=r= len) { memmove(smi,p2,len=q2-p2); } q=q2; } smi[len]='\0'; return(smi); }