00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "includes.h"
00024
00025
00026 #define COUNTSMARTS_VERSION "0.1"
00027 #define COUNTSMARTS_DATE "11-JUN-2004"
00028
00029
00030
00031 int main( int argc, char *argv[] );
00032
00033 void parseClp( const int argc, char* argv[], CLP_Ptr clpPtr );
00034
00035 List_Ptr readDataFromFile( const char* const filename, const int transform, const int uniqueData, \
00036 FILE* errorLogFile );
00037
00038 List_Ptr readDataFromStream( FILE* inputStream, const char* const nameOfStream, \
00039 const int daylight_type, const int uniqueData, FILE* errorLogFile );
00040
00041 DoubleArrayPtr getFileProperties( FILE* inputStream, const char* const nameOfStream );
00042
00043 void displayHelpText( );
00044
00045 void displayVersionInformation( void );
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060 int main ( int argc, char *argv[] )
00061 {
00062 CLP_Ptr clpPtr;
00063 FILE* outFile;
00064 FILE* errorLogFile;
00065 List_Ptr inputDataPtr;
00066 List_Ptr smartsDataPtr;
00067 SmilesCompound_Ptr currentSmartsPtr;
00068 SmilesCompound_Ptr currentSmilesPtr;
00069 int totalNumberOfCompounds = 0;
00070 int matchcounter = 0;
00071
00072
00073 dt_Handle errorSequenceHandle;
00074 dt_Handle errorHandle;
00075 dt_Integer length;
00076
00077 dt_Handle currentSmartsHandle;
00078
00079 dt_Handle currentSmilesHandle;
00080
00081 dt_Handle pathset;
00082
00083 dt_Handle pathStream;
00084 dt_Handle path;
00085
00086
00087 clpPtr = CLP_create( );
00088 parseClp( argc, argv, clpPtr );
00089
00090 fprintf( stderr, "\ncountSmarts %s ( %s )\n\n", COUNTSMARTS_VERSION, COUNTSMARTS_DATE );
00091 CLP_display( clpPtr, stderr );
00092
00093
00094 if( !( outFile = fopen( CLP_getOutputFile( clpPtr ), "w" ) ) )
00095 FileWriteError( CLP_getOutputFile( clpPtr ) );
00096
00097
00098 if( !( errorLogFile = fopen( CLP_getErrorLogFile( clpPtr ), "w" ) ) )
00099 FileWriteError( CLP_getErrorLogFile( clpPtr ) );
00100
00101
00102 fprintf( stderr, "Reading file from stdin......." );
00103 inputDataPtr = readDataFromStream( stdin, "<stdin>", DAYLIGHT_SMILES, \
00104 CLP_getUniqueSmiles( clpPtr ), errorLogFile );
00105 List_setName( inputDataPtr, "SMILESFile" );
00106 List_rewind( inputDataPtr );
00107 fprintf( stderr, "Done\n" );
00108 while( List_hasNext( inputDataPtr ) )
00109 totalNumberOfCompounds += SmilesCompound_getCounter( List_getNext( inputDataPtr ) );
00110 fprintf( stderr, "Total number of compounds in input file: %d\n", totalNumberOfCompounds );
00111
00112 if( CLP_getUniqueSmiles( clpPtr ) == BOOLEAN_TRUE )
00113 fprintf( stderr, "Number of unique compounds in input file: %d\n\n", \
00114 List_getNumberOfNodes( inputDataPtr ) );
00115
00116
00117 fprintf( stderr, "Reading SMARTES file %s...", CLP_getSmartsFile( clpPtr ) );
00118 smartsDataPtr = readDataFromFile( CLP_getSmartsFile( clpPtr ), DAYLIGHT_SMARTS, \
00119 CLP_getUniqueSmiles( clpPtr ), errorLogFile );
00120 List_setName( smartsDataPtr, "SMARTSFile" );
00121 fprintf( stderr, "Done\n" );
00122 fprintf( stderr, "Number of SMARTS: %d\n", List_getNumberOfNodes( smartsDataPtr ) );
00123
00124
00125 fprintf( outFile, "#ID" );
00126 List_rewind( smartsDataPtr );
00127 while( List_hasNext( smartsDataPtr ) )
00128 {
00129 currentSmartsPtr = List_getNext( smartsDataPtr );
00130 if( StringArray_getNumberOfElements( SmilesCompound_getStringArrayOfNames( currentSmartsPtr ) ) > 0 )
00131 fprintf( outFile, "\t%s", StringArray_getElement( SmilesCompound_getStringArrayOfNames( \
00132 currentSmartsPtr ), 0 ) );
00133 else
00134 fprintf( outFile, "\t%s", SmilesCompound_getSmiles( currentSmartsPtr ) );
00135 }
00136 fprintf( outFile, "\n" );
00137
00138
00139 List_rewind( inputDataPtr );
00140 List_rewind( smartsDataPtr );
00141 fprintf( stderr, "Counting SMARTS in SMILES..." );
00142
00143 while( List_hasNext( inputDataPtr ) )
00144 {
00145 currentSmilesPtr = List_getNext( inputDataPtr );
00146 currentSmilesHandle = SmilesCompound_getMoleculeHandle( currentSmilesPtr );
00147 fprintf( outFile, "%s", StringArray_getElement( \
00148 SmilesCompound_getStringArrayOfNames(currentSmilesPtr), 0 ) );
00149
00150
00151 while( List_hasNext( smartsDataPtr ) )
00152 {
00153
00154 matchcounter = 0;
00155 currentSmartsPtr = List_getNext(smartsDataPtr);
00156 currentSmartsHandle = SmilesCompound_getMoleculeHandle(currentSmartsPtr);
00157
00158
00159 switch( CLP_getMatchingType( clpPtr ) )
00160 {
00161 case 1:
00162 pathset = dt_match( currentSmartsHandle, currentSmilesHandle, 0 );
00163 break;
00164 case 2:
00165 pathset = dt_umatch( currentSmartsHandle, currentSmilesHandle, 0 );
00166 break;
00167 case 3:
00168 pathset = dt_xmatch( currentSmartsHandle, currentSmilesHandle, 0 );
00169 break;
00170 default:
00171 pathset = dt_umatch( currentSmartsHandle, currentSmilesHandle, 0 );
00172 break;
00173 }
00174
00175 if( pathset != NULL_OB )
00176 {
00177 pathStream = dt_stream( pathset, TYP_PATH );
00178 while( NULL_OB != ( path = dt_next( pathStream ) ) )
00179 matchcounter++;
00180 dt_dealloc( pathset );
00181 }
00182 fprintf( outFile, "\t%d", matchcounter );
00183 }
00184
00185 fprintf(outFile, "\n");
00186 List_rewind( smartsDataPtr );
00187
00188 }
00189
00190 fprintf( stderr, "Done\n" );
00191
00192
00193 errorSequenceHandle = dt_errors( DX_ERR_NONE );
00194 while( NULL_OB != ( errorHandle = dt_next( errorSequenceHandle ) ) )
00195 fprintf( errorLogFile, "%s\n", dt_stringvalue( &length, errorHandle ) );
00196
00197
00198 List_destroy( inputDataPtr );
00199 List_destroy( smartsDataPtr );
00200 fclose( outFile );
00201 CLP_destroy( clpPtr );
00202 fprintf( stderr, "Adios.\n\n");
00203 return EXIT_SUCCESS;
00204 }
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220 void parseClp( const int argc, char* argv[], CLP_Ptr clpPtr )
00221 {
00222 int c;
00223 int matchingType;
00224 char* errorLogFile;
00225 Given_CLP_Ptr givenClpPtr;
00226
00227 givenClpPtr = GivenClp_create( );
00228
00229 opterr = 0;
00230
00231 while ( ( c = getopt( argc, argv, "hvuo:s:m:" ) ) != -1 )
00232 {
00233 switch( c )
00234 {
00235
00236 case 'h':
00237 displayHelpText( );
00238 break;
00239
00240 case 'v':
00241 displayVersionInformation();
00242 break;
00243
00244 case 'o':
00245 CLP_setOutputFile( clpPtr, optarg );
00246 GivenClp_setOption( givenClpPtr, 'o', BOOLEAN_TRUE );
00247 break;
00248
00249 case 's':
00250 CLP_setSmartsFile( clpPtr, optarg );
00251 GivenClp_setOption( givenClpPtr, 's', BOOLEAN_TRUE );
00252 break;
00253 case 'm':
00254 sscanf( optarg, "%d", &matchingType );
00255 CLP_setMatchingType( clpPtr, matchingType );
00256 GivenClp_setOption( givenClpPtr, 'm', BOOLEAN_TRUE );
00257 break;
00258 case 'u':
00259 CLP_setUniqueSmiles( clpPtr, BOOLEAN_TRUE );
00260 GivenClp_setOption( givenClpPtr, 'u', BOOLEAN_TRUE );
00261 break;
00262 case '?':
00263 fprintf( stderr, "\n\nERROR: There is given either an option without an argument that\n" );
00264 fprintf( stderr, "requires one or an unknown option!\n" );
00265 fprintf( stderr, "Type 'countSmarts -h' for a detailed help text!\n");
00266 AbortProgram;
00267 break;
00268 }
00269 }
00270
00271
00272 if( isatty( STDIN_FILENO ) )
00273 displayHelpText( clpPtr );
00274
00275
00276 if( GivenClp_getOption( givenClpPtr, 'o' ) == BOOLEAN_FALSE )
00277 MandatoryOption( "'-o'" );
00278
00279
00280 if( GivenClp_getOption( givenClpPtr, 's' ) == BOOLEAN_FALSE )
00281 MandatoryOption( "'-s'" );
00282
00283
00284 if( !( errorLogFile = calloc( strlen( CLP_getOutputFile( clpPtr ) ) + 5, sizeof( char ) ) ) )
00285 MemoryError( "parseClp", "errorLogFile" );
00286 strncpy( errorLogFile, CLP_getOutputFile( clpPtr ), strlen( CLP_getOutputFile( clpPtr ) ) + 1 );
00287 CLP_setErrorLogFile( clpPtr, strncat( errorLogFile, ".log", 4 ) );
00288 if( errorLogFile != NULL )
00289 free( errorLogFile );
00290
00291 GivenClp_destroy( givenClpPtr );
00292 }
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308 List_Ptr readDataFromFile( const char* const filename, const int daylight_type, const int uniqueData, \
00309 FILE* errorLogFile )
00310 {
00311 FILE* dataFile;
00312
00313
00314 if( !( dataFile = fopen( filename, "r" ) ) )
00315 FileReadError( filename );
00316
00317
00318 return readDataFromStream( dataFile, filename, daylight_type, uniqueData, errorLogFile );
00319 }
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336
00337
00338
00339 List_Ptr readDataFromStream( FILE* inputStream, const char* const nameOfStream, const int daylight_type, \
00340 const int uniqueData, FILE* errorLogFile )
00341 {
00342 int numberOfColumns;
00343
00344 int maxNumberOfCharsPerRow;
00345 int numberOfRows;
00346 char* tempRow;
00347 char* substring;
00348 char* tempSmiles;
00349 char* tempName = NULL;
00350 DoubleArrayPtr dAPtr;
00351 List_Ptr listPtr;
00352 SmilesCompound_Ptr scPtr;
00353 char* errorMessage;
00354
00355 StringArray_Ptr sAPtr;
00356
00357
00358 dAPtr = getFileProperties( inputStream, nameOfStream );
00359
00360 rewind( inputStream );
00361 numberOfColumns = (int) DoubleArray_getValue( dAPtr, 0 );
00362 maxNumberOfCharsPerRow = (int) DoubleArray_getValue( dAPtr, 1 );
00363 numberOfRows = (int) DoubleArray_getValue( dAPtr, 2 );
00364
00365 #if COUNTSMARTS_DEBUG
00366 fprintf( stdout, "\nNoColumns = %d\tMaxNoChars = %d\tNoRows = %d\n", \
00367 numberOfColumns, maxNumberOfCharsPerRow, numberOfRows );
00368 #endif
00369
00370 if( !( tempRow = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00371 MemoryError( "tempRow", "readDataFromStream" );
00372 if( !( tempSmiles = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00373 MemoryError( "tempSmiles", "readDataFromStream" );
00374 if( !( tempName = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00375 MemoryError( "tempName", "readDataFromStream" );
00376 listPtr = List_create( uniqueData, SmilesCompound_display, SmilesCompound_destroy, \
00377 SmilesCompound_identical );
00378 sAPtr = StringArray_create( );
00379
00380
00381
00382 while( fgets( tempRow, maxNumberOfCharsPerRow + 2, inputStream ) != NULL )
00383 {
00384 #if COUNTSMARTS_DEBUG
00385 fprintf( stdout, "tempRow = %s", tempRow ); fflush( NULL );
00386 #endif
00387
00388
00389 if( numberOfColumns == 1 )
00390 {
00391 strncpy( tempSmiles, tempRow, strlen( tempRow ) - 1 );
00392 tempSmiles[ strlen( tempRow ) - 1 ] = '\0';
00393 strncpy( tempName, tempRow, strlen( tempRow ) - 1 );
00394 tempName[ strlen( tempRow ) - 1 ] = '\0';
00395 }
00396
00397 else
00398 {
00399 substring = strstr( tempRow, "\t" );
00400 strncpy( tempSmiles, tempRow, ( substring - tempRow ) );
00401 tempSmiles[ substring - tempRow ] = '\0';
00402 strncpy( tempName, substring + 1, ( tempRow + strlen( tempRow ) - 1 ) - ( substring + 1 ) );
00403 tempName[ ( tempRow + strlen( tempRow ) - 1 ) - ( substring + 1 ) ] = '\0';
00404 }
00405 #if COUNTSMARTS_DEBUG
00406 fprintf( stdout, "Smiles: %s\tName: %s\n", tempSmiles, tempName ); fflush( NULL );
00407 #endif
00408
00409
00410 scPtr = SmilesCompound_create( tempName, tempSmiles, daylight_type, errorLogFile );
00411
00412 if( scPtr != BOOLEAN_FALSE )
00413 {
00414 if( List_insertTail( listPtr, scPtr ) == BOOLEAN_FALSE )
00415 SmilesCompound_destroy( scPtr );
00416 }
00417 else
00418 {
00419 if( !( errorMessage = calloc( 300, sizeof( char ) ) ) )
00420 MemoryError( "errorMessage", "readDataFromStream" );
00421 sprintf( errorMessage, "Error reading compound %s\n", tempName );
00422 StringArray_addElement( sAPtr, errorMessage );
00423 }
00424 }
00425
00426 if( StringArray_getNumberOfElements( sAPtr ) != 0 )
00427 StringArray_display( sAPtr, stderr );
00428
00429
00430 free( tempRow );
00431 free( tempSmiles );
00432 free( tempName );
00433 StringArray_destroy( sAPtr );
00434
00435 return listPtr;
00436 }
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458 DoubleArrayPtr getFileProperties( FILE* inputStream, const char* const nameOfStream )
00459 {
00460 int numberOfColumns = 1;
00461 int maxCharsPerRow = 0;
00462 int numberOfRows = 0;
00463 char currentCharacter;
00464 int position;
00465 char* tempRow;
00466 char* substring;
00467 int counter = 1;
00468
00469
00470
00471
00472
00473 DoubleArrayPtr dAPtr;
00474
00475 if( !( tempRow = calloc( 10000, sizeof( char ) ) ) )
00476 MemoryError( "tempRow", "getFileProperties" );
00477
00478 dAPtr = DoubleArray_create( 3 );
00479
00480
00481 while( ( currentCharacter = fgetc( inputStream ) ) != EOF )
00482 {
00483
00484 ungetc( currentCharacter, inputStream );
00485
00486
00487 position = 0;
00488 counter = 1;
00489
00490
00491 while( ( currentCharacter = fgetc( inputStream ) ) != '\n' )
00492 {
00493 tempRow[ position ] = currentCharacter;
00494 position++;
00495 }
00496 tempRow[ position ] = '\0';
00497 numberOfRows++;
00498
00499
00500 substring = strstr( tempRow, "\t" );
00501 while( substring != NULL)
00502 {
00503 counter++;
00504 substring = strstr( substring + 1, "\t" );
00505 }
00506
00507
00508 if( numberOfRows == 1 )
00509 numberOfColumns = counter;
00510
00511
00512 if( counter != numberOfColumns )
00513 FileIntegrityError( nameOfStream, numberOfRows, counter, numberOfColumns );
00514
00515
00516 if( strlen( tempRow ) > maxCharsPerRow )
00517 maxCharsPerRow = position;
00518 }
00519
00520
00521 if( numberOfColumns > 2 )
00522 {
00523 fprintf( stderr, "\nERROR: The number of columns in %s is neither one nor two!\n", nameOfStream );
00524 fprintf( stderr, "Only input files with the name in the first column and the SMILES string\n" );
00525 fprintf( stderr, "in the second column or the SMILES string as the only column are valid!\n" );
00526 AbortProgram;
00527 }
00528
00529
00530 DoubleArray_setValue( dAPtr, 0, numberOfColumns );
00531 DoubleArray_setValue( dAPtr, 1, maxCharsPerRow );
00532 DoubleArray_setValue( dAPtr, 2, numberOfRows );
00533
00534 free( tempRow );
00535
00536 return dAPtr;
00537 }
00538
00539
00540
00541
00542
00543
00544 void displayHelpText( )
00545 {
00546 fprintf( stderr, "\nNAME:\n" );
00547 fprintf( stderr, " countSmarts\n" );
00548 fprintf( stderr, "\nFUNCTION:\n" );
00549 fprintf( stderr, " Counts the occurrences of SMARTS in a SMILES.\n" );
00550 fprintf( stderr, "\nUSAGE:\n" );
00551 fprintf( stderr, " countSmarts [Options] <INFILE\n" );
00552 fprintf( stderr, " INFILE has to be a file with one SMILES per row or a tab separated file\n" );
00553 fprintf( stderr, " with SMILES in the first and labels in the second column.\n" );
00554 fprintf( stderr, "\nREMARK:\n" );
00555 fprintf( stderr, " An error log file is automatically created. Its name is the suffix '.log'\n" );
00556 fprintf( stderr, " appended to the name of the output file.\n" );
00557 fprintf( stderr, "\nOPTIONS:\n" );
00558 fprintf( stderr, " -h\n" );
00559 fprintf( stderr, " Display this help text.\n" );
00560 fprintf( stderr, " -v\n" );
00561 fprintf( stderr, " Display detailed version information and exit.\n" );
00562 fprintf( stderr, " -o FILENAME @ MANDATORY\n" );
00563 fprintf( stderr, " Set FILENAME as the name of the output file.\n" );
00564 fprintf( stderr, " -s FILENAME @ MANDATORY\n" );
00565 fprintf( stderr, " Set FILENAME as the name of the SMARTS file (same format as the INFILE).\n" );
00566 fprintf( stderr, " -m INTEGER @ [1,3] @ DEFAULT = 2\n" );
00567 fprintf( stderr, " Specifies the SMARTS match type:\n" );
00568 fprintf( stderr, " 1 = exhaustive search [dt_match,0]\n" );
00569 fprintf( stderr, " 2 = unique set-of-atoms in results [dt_umatch,0])\n" );
00570 fprintf( stderr, " 3 = each atom appears in exactly one result [dt_xmatch,0]\n" );
00571 fprintf( stderr, " -u\n" );
00572 fprintf( stderr, " Only unique SMILES are written to the output file.\n" );
00573 fprintf( stderr, "\nAUTHOR:\n" );
00574 fprintf( stderr, " Uli Fechner\n" );
00575 fprintf( stderr, "\nVERSION & RELEASE DATE:\n" );
00576 fprintf( stderr, " %s ( %s )\n", COUNTSMARTS_VERSION, COUNTSMARTS_DATE );
00577 fprintf( stderr, "\nBUGS:\n" );
00578 fprintf( stderr, " Please report bugs to u.fechner@chemie.uni-frankfurt.de\n\n" );
00579 exit( EXIT_SUCCESS );
00580 }
00581
00582
00583
00584
00585
00586
00587 void displayVersionInformation( void )
00588 {
00589 fprintf( stderr, "\nVERSION & RELEASE DATE:\n\n");
00590 fprintf( stderr, " includes.h: %s [%s]\n", INCLUDES_VERSION, INCLUDES_DATE );
00591 fprintf( stderr, " generalDefines.h: %s [%s]\n", GENERALDEFINES_VERSION, GENERALDEFINES_DATE );
00592 fprintf( stderr, " givenClp.c: %s [%s]\n", GIVENCLP_VERSION, GIVENCLP_DATE );
00593 fprintf( stderr, " doubleArray.c: %s [%s]\n", DOUBLEARRAY_VERSION, DOUBLEARRAY_DATE );
00594 fprintf( stderr, " clp.c: %s [%s]\n", CLP_VERSION, CLP_DATE );
00595 fprintf( stderr, " smilesCompound.c: %s [%s]\n", SMILESCOMPOUND_VERSION, SMILESCOMPOUND_DATE );
00596 fprintf( stderr, " doubleLinkedList.c: %s [%s]\n", DOUBLELINKEDLIST_VERSION, DOUBLELINKEDLIST_DATE );
00597 fprintf( stderr, " countSmarts core: %s [%s]\n\n", COUNTSMARTS_VERSION, COUNTSMARTS_DATE );
00598 exit( EXIT_SUCCESS );
00599 }