/* -*-C-*- */ /********************************************************************** * This source code is public domain, and may be freely distributed, * * modified, and used for any purpose. IT COMES WITH ABSOLUTELY NO * * WARRANTY OF ANY KIND. * **********************************************************************/ /*+====================================================================== | FILE: findpath.c | DESCRIPTION: | Looks recursively through THOR cross-references to find | a sequence of reactions back to available starting materials. | | Demonstrates the use of cross-references for finding reaction | schemes. Note that this program does not keep the multi-step | schemes found. They are simply printed out and discarded. | | Options: | -s -- skips non-organic reactants (just looks for any | carbons in the molecule). This significantly prunes | the search tree. | -h or -help -- Prints usage message and quits. | +====================================================================== */ #include #include #include #include "dt_smiles.h" #include "dt_thor.h" #include "du_utils.h" #define MX_SMILES 8000 #define MX_DEPTH 20 #define MX_XREF 20 static char blankarray[MX_DEPTH * 2]; dt_Handle rdb, mdb, pmoldt, rmoldt, rsmidt, msmidt, treeseq; int depth, leafs; int skip_nonorg = 0; int recurse(int, char *, int); int no_org(int, char *); /*************************************************************************** * FUNCTION: usage * * DESCRIPTION: * * * RETURNS: (void) ***************************************************************************/ void usage() { printf("SYNOPSIS:\n\n"); printf("findpath [options] [hostname [rxndb [moldb ] ] ]\n\n"); printf(" OPTIONS:\n"); printf(" -s ..... Skip non-organic molecules (no carbons).\n"); printf(" -h ..... Print this message and exit (-help works too).\n"); printf(" ARGS:\n"); printf(" hostname ......... thor server host.\n"); printf(" rxndb ............ reaction database name.\n"); printf(" moldb ............ molecule database name.\n"); printf("Enter SMILES on stdin, output to stdout.\n\n"); exit(1); } /*************************************************************************** * FUNCTION: zapseq * * RETURNS: (dt_Handle) ***************************************************************************/ void zapseq(dt_Handle seq) { dt_Handle member; dt_reset(seq); while (NULL_OB != (member = dt_next(seq))) dt_dealloc(member); dt_dealloc(seq); return; } /*************************************************************************** * FUNCTION: crash_and_burn * * RETURNS: (static void) ***************************************************************************/ static void crash_and_burn(dt_String s) { dt_errorsave("findpath", DX_ERR_ERROR, s); du_printerrors(stderr, 0); exit(1); } /*************************************************************************** * FUNCTION: write_rmol * * RETURNS: (static int) ***************************************************************************/ void write_rmol(dt_Handle dataitem, int level) { dt_Handle dfs, df; char *str, buf[MX_DEPTH * 2]; int lens; strncpy(buf, blankarray, level * 2 + 1); *(buf + (level * 2 + 1)) = '\0'; dfs = dt_stream(dataitem, TYP_DATAFIELD); df = dt_next(dfs); str = dt_thor_raw_datafield(&lens, df); /*** Print content ***/ if (NULL != str && 0 < lens) { printf("%sReactant: %.*s\n", buf, lens, str); if (!recurse(lens, str, level + 1)) { printf("%s Not accessible\n", buf); leafs++; } } dt_dealloc(dfs); return; } /*************************************************************************** * FUNCTION: recurse * * RETURNS: (int) ***************************************************************************/ int recurse(int plen, char *pmol, int level) { dt_Handle psos, strob, tree, subtrees, subtree, dataitems, dataitem; int oops, lens; char *str, buf[MX_DEPTH * 2]; strncpy(buf, blankarray, level * 2); *(buf + (level * 2)) = '\0'; depth = DU_MAX(depth, level); /*** Prune non-organic (OK, a loose definition I'll admit) cross-references ***/ if (skip_nonorg && no_org(plen, pmol)) { printf("%sMolecule is non-organic. Skipped.\n", buf); return (TRUE); } /*** Check to see if molecule is available. If so, we're done with this branch. ***/ tree = dt_thor_tdtget(mdb, msmidt, strlen(pmol), pmol, 0, &oops); if (tree != NULL_OB) { leafs++; printf("%sThis molecule is available.\n", buf); dt_append(treeseq, tree); return (TRUE); } /*** Punt if too deep already. ***/ if (level >= MX_DEPTH) { leafs++; printf("%sTerminating search at depth %d.\n", buf, level); return (TRUE); } /*** Check as a product. ***/ psos = dt_thor_xrefget(rdb, pmoldt, plen, pmol, &oops); if (NULL_OB == psos) return (FALSE); /*** Punt if too many xrefs. ***/ if (dt_count(psos, TYP_ANY) > MX_XREF) { leafs++; printf("%sToo many crossreferences. Punting.\n", buf); zapseq(psos); return (TRUE); } /*** zap the first in each (its the molecule itself), sort the remaining strings. ***/ strob = dt_next(psos); dt_delete(psos); dt_dealloc(strob); dt_reset(psos); while (NULL_OB != (strob = dt_next(psos))) { str = dt_stringvalue(&lens, strob); printf("%sReaction: %.*s\n", buf, lens, str); tree = dt_thor_tdtget(rdb, rsmidt, lens, str, 0, &oops); dt_delete(psos); dt_dealloc(strob); if (tree != NULL_OB) { if (dt_boolean(tree, 5, "found")) printf("%sAlready found.\n", buf); else { dt_setboolean(tree, 5, "found", TRUE); dt_append(treeseq, tree); subtrees = dt_stream(tree, TYP_DATATREE); while (NULL_OB != (subtree = dt_next(subtrees))) { dataitems = dt_stream(subtree, TYP_DATAITEM); while (NULL_OB != (dataitem = dt_next(dataitems))) if (dt_datatype(dataitem) == rmoldt) write_rmol(dataitem, level); dt_dealloc(dataitems); } dt_dealloc(subtrees); } } } dt_dealloc(psos); return (TRUE); } /*************************************************************************** * FUNCTION: main * * RETURNS: (int) ***************************************************************************/ main(int argc, char *argv[]) { dt_Handle server, tree; char host[100], rdbname[100], mdbname[100], pmol[MX_SMILES]; int i, args; /*** Blank array for indentation on output. ***/ for (i = 0; i < (MX_DEPTH * 2); blankarray[i++] = ' '); args = 1; if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "-help") == 0)) usage(); if (strcmp(argv[1], "-s") == 0) { skip_nonorg = 1; args++; } /**** Set host and database names from arguments or set defaults. ****/ gethostname(host, 100); strcpy(rdbname, "ccr972"); strcpy(mdbname, "acd972"); if (argc > args) { strcpy(host, argv[args]); args++; } if (argc > args) { strcpy(rdbname, argv[args]); args++; } if (argc > args) { strcpy(mdbname, argv[args]); args++; } /*** Make a sequence to hold reaction datatrees. ***/ treeseq = dt_alloc_seq(); /*** Connect to server ***/ server = dt_thor_connect(host, "5555", "thor", ""); fprintf(stderr, "Connected to %s, opening database...\n", host); /*** Open the reaction database ***/ rdb = dt_thor_open(server, rdbname, "r", ""); if (server == NULL_OB || rdb == NULL_OB) crash_and_burn("Can't open reaction database."); fprintf(stderr, "Reaction database %s opened.\n", rdbname); /*** Open the molecule database. ***/ mdb = dt_thor_open(server, mdbname, "r", ""); if (server == NULL_OB || mdb == NULL_OB) crash_and_burn("Can't open molecule database."); fprintf(stderr, "Molecule database %s opened.\n", mdbname); /**** Get the datatype objects ****/ rsmidt = dt_thor_getdatatype(rdb, 4, "$SMI"); if (NULL_OB == rsmidt) crash_and_burn("Can't find SMILES datatype in the reaction database!"); rmoldt = dt_thor_getdatatype(rdb, 5, "$RMOL"); if (NULL_OB == rmoldt) crash_and_burn("Can't find Reactant molecule datatype!"); pmoldt = dt_thor_getdatatype(rdb, 5, "$PMOL"); if (NULL_OB == pmoldt) crash_and_burn("Can't find Product molecule datatype!"); /*** Get the smiles datatype from the molecule db. ***/ msmidt = dt_thor_getdatatype(mdb, 4, "$SMI"); if (NULL_OB == msmidt) crash_and_burn("Can't find SMILES datatype in the molecule database!"); /**** Read from input, print TDTs to output ****/ while (gets(pmol)) { fprintf(stderr, "\n\nTarget molecule: %s\n", pmol); /*** Then, get the reactions ***/ leafs = depth = 0; if (!recurse(strlen(pmol), pmol, 0)) { fprintf(stderr, "d: %02d l: %04d %s\n", -1, 0, pmol); printf("Not accessible.\n"); leafs++; } else fprintf(stderr, "d: %02d l: %04d %s\n", depth, leafs, pmol); dt_reset(treeseq); while (NULL_OB != (tree = dt_next(treeseq))) { dt_delete(treeseq); dt_dealloc(tree); /*** Alternately: dt_setboolean(tree, 5, "found", FALSE); ***/ } } /*** Be polite to thor server; close connection before exiting. ***/ dt_dealloc(treeseq); dt_dealloc(rdb); dt_dealloc(mdb); dt_dealloc(server); fprintf(stderr, "count: %d\n", dt_vh_count()); return(0); } /*************************************************************************** * FUNCTION: no_org * * DESCRIPTION: * Take a SMILES string on input. Return TRUE or FALSE depending * on whether or not it is 'organic'. Organic simply means that it * contains carbon for this case. Simple lexical analyzer looks * for 'C' or 'c', excluding things like 'Cr', 'Tc', etc. * * RETURNS: (int) ***************************************************************************/ int no_org(int slen, char *str) { char *p, *end; if ((str == NULL) || (slen < 1)) return TRUE; end = str + slen; p = str; while (p < end) { if (*p == 'C') { p++; if (p == end) return (FALSE); /* last character 'C' */ if ((*p != 'a') && (*p != 'l') && (*p != 'r') && (*p != 'o') && (*p != 'u') && (*p != 'd') && (*p != 'e') && (*p != 'f') && (*p != 'm')) return (FALSE); /* C, not a two-letter atom */ } if (*p == 'c') { if (p == str) return (FALSE); /* first character 'c' */ if ((*(p-1) != 'S') && (*(p-1) != 'T') && (*(p-1) != 'A')) return (FALSE); /* not Tc, Sc, Ac */ } p++; } return TRUE; }