Main Page | Data Structures | File List | Data Fields | Globals

countSmarts.c

Go to the documentation of this file.
00001 /** @mainpage countSmarts
00002 * <H1><center><B>countSmarts</B></center></H1>
00003 * <p>
00004 * @b countSmarts counts the occurrences of SMARTS in a SMILES. Both the SMARTS and the SMILES have to
00005 * be provided as tab separated files with SMARTS/SMILES in the first column and an optional name in
00006 * the second column. The program also allows the removal of duplicate SMILES where duplicate means that
00007 * the unique SMILES are identical.
00008 * <p>
00009 * &copy; 2004 by Uli Fechner
00010 */
00011 
00012 /** @file
00013 *
00014 * This is the main file of the program countSmarts. All other files are included here via the inclusion
00015 * of the file @ref includes.h.
00016 *
00017 * @author Uli Fechner
00018 * @version 11/06/2004 - UF - 0.1 - initial stable release
00019 */
00020 
00021 /* preprocessor includes */
00022 
00023 #include "includes.h"
00024 
00025 /* just some information about this file */
00026 #define COUNTSMARTS_VERSION     "0.1"
00027 #define COUNTSMARTS_DATE        "11-JUN-2004"
00028 
00029 /* function prototypes */
00030 
00031 int main( int argc, char *argv[] );
00032 
00033 void parseClp( const int argc, char* argv[], CLP_Ptr clpPtr );
00034 
00035 List_Ptr readDataFromFile( const char* const filename, const int transform, const int uniqueData, \
00036         FILE* errorLogFile );
00037 
00038 List_Ptr readDataFromStream( FILE* inputStream, const char* const nameOfStream, \
00039         const int daylight_type, const int uniqueData, FILE* errorLogFile );
00040 
00041 DoubleArrayPtr getFileProperties( FILE* inputStream, const char* const nameOfStream );
00042 
00043 void displayHelpText( );
00044 
00045 void displayVersionInformation( void );
00046 
00047 /* functions */
00048 
00049 /** Main function of the program.
00050 *
00051 * The main function just calls the other functions. It is reduced to contain only function calls
00052 * whenever possible.
00053 *
00054 * @param argc the number of arguments provided via the command line by executing the program
00055 * @param *argv[] string array containing the command line arguments
00056 * @retval int exit code of the program
00057 * @author Kristina Grabowski
00058 * @version 20/02/2004 - Uli Fechner - 0.1 - initial stable release
00059 */
00060 int main ( int argc, char *argv[] )
00061 {
00062         CLP_Ptr clpPtr; /* pointer on CommandLineParameters */
00063         FILE* outFile; /* file pointer on the output file */
00064         FILE* errorLogFile; /* file pointer on the SMILES error log file */
00065         List_Ptr inputDataPtr; /* pointer on a structure List that stores the data of the input file */
00066         List_Ptr smartsDataPtr; /* pointer on a List that stores the data of the reaction smiles file */
00067         SmilesCompound_Ptr currentSmartsPtr; /* pointer on current reaction SMILES in OUTER LOOP */
00068         SmilesCompound_Ptr currentSmilesPtr; /* pointer on current input SMILES in INNER LOOP */
00069         int totalNumberOfCompounds = 0; /* total number of compounds in input file */
00070         int matchcounter = 0;   /* number of matched SMARTS for all SMARTS in the input SMART file */
00071         
00072         /***** Daylight Toolkit variables *****/
00073         dt_Handle errorSequenceHandle; /* dt_Handle on a stream that contains the error messages of the DT */
00074         dt_Handle errorHandle; /* dt_Handle on one error message of errorSequenceHandle */
00075         dt_Integer length; /* needed by dt_cansmiles, dt_typename and dt_stringvalue */
00076         /* Daylight Toolkit variables of the OUTER LOOP */
00077         dt_Handle currentSmartsHandle; /* dt_Handle on current reaction SMILES */
00078         /* Daylight Toolkit variables of the INNER LOOP */
00079         dt_Handle currentSmilesHandle; /* dt_Handle on the current input SMILES */
00080         /* dt_Handle on the pathset, each path in the pathset contains a unique set of atoms */
00081         dt_Handle pathset;
00082         /* dt_Handle on a stream object containing all of the parts of type 'TYP_PATH' in the object 'pathset'*/
00083         dt_Handle pathStream;
00084         dt_Handle path; /* dt_Handle on the current path */    
00085         
00086         /* creating a CommandLineParameters structure and initialize it with default values */
00087         clpPtr = CLP_create( );
00088         parseClp( argc, argv, clpPtr ); /* parsing the command line arguments */
00089         /* the command line arguments are shown on standard error */
00090         fprintf( stderr, "\ncountSmarts %s ( %s )\n\n", COUNTSMARTS_VERSION, COUNTSMARTS_DATE );
00091         CLP_display( clpPtr, stderr );
00092         
00093         /* opening the output file; if an error occurs the program aborts */
00094         if( !( outFile = fopen( CLP_getOutputFile( clpPtr ), "w" ) ) )
00095                 FileWriteError( CLP_getOutputFile( clpPtr ) );
00096         
00097         /* opening the SMILES error log file; if an error occurs the program aborts */
00098         if( !( errorLogFile = fopen( CLP_getErrorLogFile( clpPtr ), "w" ) ) )
00099                 FileWriteError( CLP_getErrorLogFile( clpPtr ) );
00100 
00101         /* reading the data from stdin */
00102         fprintf( stderr, "Reading file from stdin......." );
00103         inputDataPtr = readDataFromStream( stdin, "<stdin>", DAYLIGHT_SMILES, \
00104                 CLP_getUniqueSmiles( clpPtr ), errorLogFile );
00105         List_setName( inputDataPtr, "SMILESFile" );
00106         List_rewind( inputDataPtr );
00107         fprintf( stderr, "Done\n" );
00108         while( List_hasNext( inputDataPtr ) )
00109                 totalNumberOfCompounds += SmilesCompound_getCounter( List_getNext( inputDataPtr ) );
00110         fprintf( stderr, "Total number of compounds in input file: %d\n", totalNumberOfCompounds );
00111         /* print out the number of unique compounds if the option is set */
00112         if( CLP_getUniqueSmiles( clpPtr ) == BOOLEAN_TRUE )
00113                 fprintf( stderr, "Number of unique compounds in input file: %d\n\n", \
00114                 List_getNumberOfNodes( inputDataPtr ) );
00115 
00116         /* reading SMARTS (provided with '-s' option) */
00117         fprintf( stderr, "Reading SMARTES file %s...", CLP_getSmartsFile( clpPtr ) );
00118         smartsDataPtr = readDataFromFile( CLP_getSmartsFile( clpPtr ), DAYLIGHT_SMARTS, \
00119                 CLP_getUniqueSmiles( clpPtr ), errorLogFile );
00120         List_setName( smartsDataPtr, "SMARTSFile" );
00121         fprintf( stderr, "Done\n" );
00122         fprintf( stderr, "Number of SMARTS: %d\n", List_getNumberOfNodes( smartsDataPtr ) );
00123         
00124         /* printing the header row */
00125         fprintf( outFile, "#ID" );
00126         List_rewind( smartsDataPtr );
00127         while( List_hasNext( smartsDataPtr ) )
00128         {
00129                 currentSmartsPtr = List_getNext( smartsDataPtr );
00130                 if( StringArray_getNumberOfElements( SmilesCompound_getStringArrayOfNames( currentSmartsPtr ) ) > 0 )
00131                         fprintf( outFile, "\t%s", StringArray_getElement( SmilesCompound_getStringArrayOfNames( \
00132                                 currentSmartsPtr ), 0 ) );
00133                 else
00134                         fprintf( outFile, "\t%s", SmilesCompound_getSmiles( currentSmartsPtr ) );
00135         }
00136         fprintf( outFile, "\n" );
00137         
00138         /* rewinding both lists */
00139         List_rewind( inputDataPtr );
00140         List_rewind( smartsDataPtr );
00141         fprintf( stderr, "Counting SMARTS in SMILES..." );
00142         /***** OUTER LOOP: LOOPING OVER THE SMILES FILE *****/
00143         while( List_hasNext( inputDataPtr ) )
00144         {
00145                 currentSmilesPtr = List_getNext( inputDataPtr );
00146                 currentSmilesHandle = SmilesCompound_getMoleculeHandle( currentSmilesPtr );
00147                 fprintf( outFile, "%s", StringArray_getElement( \
00148                         SmilesCompound_getStringArrayOfNames(currentSmilesPtr), 0 ) );
00149 
00150                 /***** INNER LOOP: LOOPING OVER THE SMARTS FILE *****/  
00151                 while( List_hasNext( smartsDataPtr ) )
00152                 {
00153                         /* counts the matches for one type of SMARTS in the SMARTS file */
00154                         matchcounter = 0;
00155                         currentSmartsPtr = List_getNext(smartsDataPtr);
00156                         currentSmartsHandle = SmilesCompound_getMoleculeHandle(currentSmartsPtr);
00157                         
00158                         /* choose the correct SMARTS matching method specified via a command line parameter */
00159                         switch( CLP_getMatchingType( clpPtr ) )
00160                         {
00161                                 case 1:
00162                                         pathset = dt_match( currentSmartsHandle, currentSmilesHandle, 0 );
00163                                         break;
00164                                 case 2:
00165                                         pathset = dt_umatch( currentSmartsHandle, currentSmilesHandle, 0 );
00166                                         break;
00167                                 case 3:
00168                                         pathset = dt_xmatch( currentSmartsHandle, currentSmilesHandle, 0 );
00169                                         break;
00170                                 default:
00171                                         pathset = dt_umatch( currentSmartsHandle, currentSmilesHandle, 0 );
00172                                         break;
00173                         }
00174                         
00175                         if( pathset != NULL_OB )
00176                         { 
00177                                 pathStream = dt_stream( pathset, TYP_PATH );
00178                                 while( NULL_OB != ( path = dt_next( pathStream ) ) ) 
00179                                         matchcounter++;
00180                                 dt_dealloc( pathset ); /* cleaning up */
00181                         }
00182                         fprintf( outFile, "\t%d", matchcounter );
00183     }
00184                 /**** END OF INNER LOOP ****/
00185                 fprintf(outFile, "\n");
00186                 List_rewind( smartsDataPtr );
00187 
00188         }
00189         /**** END OF OUTER LOOP ****/
00190         fprintf( stderr, "Done\n" );
00191         
00192         /* writing the Daylight Toolkit errors to the error log file */
00193         errorSequenceHandle = dt_errors( DX_ERR_NONE );
00194         while( NULL_OB != ( errorHandle = dt_next( errorSequenceHandle ) ) )
00195                 fprintf( errorLogFile, "%s\n", dt_stringvalue( &length, errorHandle ) );
00196         
00197         /* cleaning up */
00198         List_destroy( inputDataPtr );
00199         List_destroy( smartsDataPtr );
00200         fclose( outFile );
00201         CLP_destroy( clpPtr );
00202         fprintf( stderr, "Adios.\n\n");
00203         return EXIT_SUCCESS;
00204 }
00205 
00206 /** Parses the command line parameters.
00207 *
00208 * The command line parameters are parsed here. Additionally, it is checked whether the standard in is
00209 * connected to a terminal. If this is the case, no input file is provided and the help text of the
00210 * program is printed on standard error. It is also checked here if there are any incompatibilities
00211 * of the options provided at the command line. If so, the program aborts with an appropriate error
00212 * message.
00213 *
00214 * @param argc the number of arguments provided via the command line by executing the program
00215 * @param *argv[] string array containing the command line arguments
00216 * @param clpPtr pointer on structure @ref CommandLineParameters that is used to store all the command line parameters
00217 * @author Kritsina Grabowski
00218 * @version 20/02/04 - Uli Fechner - initial release
00219 */
00220 void parseClp( const int argc, char* argv[], CLP_Ptr clpPtr )
00221 {       
00222         int c; /* gets the arguments via getopt */
00223         int matchingType; /* intermediate storage of the matching type as a number */
00224         char* errorLogFile; /* temporary storage of the error log file name */
00225         Given_CLP_Ptr givenClpPtr; /* pointer on structure GivenCommandLineParameters */        
00226 
00227         givenClpPtr = GivenClp_create( ); /* creating and initializing a structure GivenCommandLineParameters */
00228         
00229         opterr = 0; /* disable the error message getopt normaly prints on stderr */
00230         
00231         while ( ( c = getopt( argc, argv, "hvuo:s:m:" ) ) != -1 )
00232         {
00233                 switch( c )
00234                 {
00235                 /* checking if to provide the help text */
00236                 case 'h':
00237                         displayHelpText( );
00238                         break;
00239                 /* checking if to provide version information text */
00240                 case 'v':
00241                         displayVersionInformation();
00242                         break;
00243                 /* the argument of the argument o defines the name of the output file */
00244                 case 'o':
00245                         CLP_setOutputFile( clpPtr, optarg );
00246                         GivenClp_setOption( givenClpPtr, 'o', BOOLEAN_TRUE );
00247                         break;          
00248                 /* the argument of the argument s defines the name of the smartsfile */
00249                 case 's':
00250                         CLP_setSmartsFile( clpPtr, optarg );
00251                         GivenClp_setOption( givenClpPtr, 's', BOOLEAN_TRUE );
00252                         break;
00253                 case 'm':
00254                         sscanf( optarg, "%d", &matchingType );
00255                         CLP_setMatchingType( clpPtr, matchingType );
00256                         GivenClp_setOption( givenClpPtr, 'm', BOOLEAN_TRUE );
00257                         break;
00258                 case 'u':
00259                         CLP_setUniqueSmiles( clpPtr, BOOLEAN_TRUE );
00260                         GivenClp_setOption( givenClpPtr, 'u', BOOLEAN_TRUE );
00261                         break;
00262                 case '?':
00263                         fprintf( stderr, "\n\nERROR: There is given either an option without an argument that\n" );
00264                         fprintf( stderr, "requires one or an unknown option!\n" );
00265                         fprintf( stderr, "Type 'countSmarts -h' for a detailed help text!\n");
00266                         AbortProgram;
00267                         break;                                  
00268                 }
00269         }
00270         
00271         /* testing whether the standard in is connected to the terminal; if so display help */
00272         if( isatty( STDIN_FILENO ) )
00273                 displayHelpText( clpPtr );
00274 
00275         /* the '-o' argument is mandatory; if itisn't provided the program aborts with an appropriate message */
00276         if( GivenClp_getOption( givenClpPtr, 'o' ) == BOOLEAN_FALSE )
00277                 MandatoryOption( "'-o'" );
00278 
00279         /* the '-s' argument is mandatory; if itisn't provided the program aborts with an appropriate message */
00280         if( GivenClp_getOption( givenClpPtr, 's' ) == BOOLEAN_FALSE )
00281                 MandatoryOption( "'-s'" );
00282         
00283         /* generating the name of the error log file and storing it in the structure CommandLineParameters */
00284         if( !( errorLogFile = calloc( strlen( CLP_getOutputFile( clpPtr ) ) + 5, sizeof( char ) ) ) )
00285                 MemoryError( "parseClp", "errorLogFile" );
00286         strncpy( errorLogFile, CLP_getOutputFile( clpPtr ), strlen( CLP_getOutputFile( clpPtr ) ) + 1 );
00287         CLP_setErrorLogFile( clpPtr, strncat( errorLogFile, ".log", 4 ) );
00288         if( errorLogFile != NULL )
00289                 free( errorLogFile );
00290         
00291         GivenClp_destroy( givenClpPtr ); /* destroying structure GivenCommandLineParameters */
00292 }
00293 
00294 /** Reads the data of a file and stores it in a structure @ref List.
00295 * 
00296 * @attention
00297 * This is mainly a wrapper for function @ref readDataFromStream!
00298 *
00299 * @param filename the name of the file to read
00300 * @param daylight_type indicates the file type (@ref DAYLIGHT_SMILES, @ref DAYLIGHT_SMIRKS, @ref DAYLIGHT_SMARTS)
00301 * @param uniqueData if set to @ref BOOLEAN_TRUE input SMILES are filtered to yield unique SMILES
00302 * @param errorLogFile file pointer on the error log file
00303 * @retval List_Ptr points on structure @ref List containing the content of @c filename
00304 * @author Uli Fechner
00305 * @version 28/11/2003 - Uli Fechner - initial release
00306 * @version 09/06/2004 - UF - added new parameter @c uniqueData
00307 */
00308 List_Ptr readDataFromFile( const char* const filename, const int daylight_type, const int uniqueData, \
00309         FILE* errorLogFile )
00310 {
00311         FILE* dataFile; /* file pointer of the file with 'filename' */
00312 
00313         /* opening of the file and checking if this was successful */
00314         if( !( dataFile = fopen( filename, "r" ) ) )
00315                 FileReadError( filename );
00316         
00317         /* calling of the sub-function that actually reads the file */
00318         return readDataFromStream( dataFile, filename, daylight_type, uniqueData, errorLogFile );
00319 }
00320 
00321 /** Reads the data of a stream and stores it in a structure @ref List.
00322 *
00323 * @param inputStream file pointer of the stream to read
00324 * @param nameOfStream string indicating the name of the stream
00325 * @param daylight_type indicates the file type (@ref DAYLIGHT_SMILES, @ref DAYLIGHT_SMIRKS, @ref DAYLIGHT_SMARTS)
00326 * @param uniqueData if set to @ref BOOLEAN_TRUE input SMILES are filtered to yield unique SMILES
00327 * @param errorLogFile file pointer on the error log file
00328 * @retval List_Ptr points on structure @ref List containing the content of @c inputStream
00329 * @author Uli Fechner
00330 * @version 28/11/2003 - Uli Fechner - initial release
00331 * @version 08/12/2003 - Uli Fechner - considered unsuccessful creations of SmilesCompounds due to
00332 * invalid input SMILES; added error output
00333 * @version 09/06/2004 - UF - added new parameter @c uniqueData; bugfix: SMILES without a name were
00334 *                     assigned an empty string;
00335 * @version 11/06/2004 - UF - MAJOR change: from now on the first column is the SMILES and the second
00336 *                     column is the name (if any; before the 1st column was the name and the 2nd column
00337 *                     was the SMILES); this follows the common format of SMILES files
00338 */
00339 List_Ptr readDataFromStream( FILE* inputStream, const char* const nameOfStream, const int daylight_type, \
00340         const int uniqueData, FILE* errorLogFile )
00341 {
00342         int numberOfColumns; /* the number of columns in the file provided via 'inputStream' */
00343         /* maximum number of characters per column of the file provided via 'inputStream' */
00344         int maxNumberOfCharsPerRow;
00345         int numberOfRows; /* the number of rows in the file provided via 'inputStream' */
00346         char* tempRow; /* temporary storage of a row of 'inputStream' */
00347         char* substring; /* pointer needed by the algorithm that divides a line into columns */
00348         char* tempSmiles; /* temporary storage for the smiles of the current row */
00349         char* tempName = NULL; /* temporary storage for the name of the current row */
00350         DoubleArrayPtr dAPtr; /* pointer on DoubleArray containing the results of getFileProperties */
00351         List_Ptr listPtr; /* pointer on DoubleLinkedList containing the content of the file */
00352         SmilesCompound_Ptr scPtr; /* temporary pointer on a SmilesCompound during insertion of a compound */
00353         char* errorMessage; /* for each unsuccessful compound creation a error message is created */
00354         /* pointer on structure stringArray to store the error messages for unsuccessful compound creations */
00355         StringArray_Ptr sAPtr;
00356         
00357         /* examine the file (number of columns, number of rows, maximum number of chars per row) */
00358         dAPtr = getFileProperties( inputStream, nameOfStream );
00359         
00360         rewind( inputStream );
00361         numberOfColumns = (int) DoubleArray_getValue( dAPtr, 0 );
00362         maxNumberOfCharsPerRow = (int) DoubleArray_getValue( dAPtr, 1 );
00363         numberOfRows = (int) DoubleArray_getValue( dAPtr, 2 );
00364         
00365         #if COUNTSMARTS_DEBUG
00366         fprintf( stdout, "\nNoColumns = %d\tMaxNoChars = %d\tNoRows = %d\n", \
00367                 numberOfColumns, maxNumberOfCharsPerRow, numberOfRows );
00368         #endif
00369 
00370         if( !( tempRow = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00371                 MemoryError( "tempRow", "readDataFromStream" );
00372         if( !( tempSmiles = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00373                 MemoryError( "tempSmiles", "readDataFromStream" );
00374         if( !( tempName = calloc( maxNumberOfCharsPerRow + 2, sizeof( char ) ) ) )
00375                 MemoryError( "tempName", "readDataFromStream" );
00376         listPtr = List_create( uniqueData, SmilesCompound_display, SmilesCompound_destroy, \
00377                 SmilesCompound_identical );
00378         sAPtr = StringArray_create( );
00379 
00380         /* in the following while loop each row of the file is read and stored in the strucure SmilesCompound */
00381         /* looping over all rows of 'inputSream' */
00382         while( fgets( tempRow, maxNumberOfCharsPerRow + 2, inputStream ) != NULL )
00383         {
00384                 #if COUNTSMARTS_DEBUG
00385                 fprintf( stdout, "tempRow = %s", tempRow ); fflush( NULL );
00386                 #endif
00387 
00388                 /* the input file contains only one column - the SMILES string */
00389                 if( numberOfColumns == 1 )
00390                 {
00391                         strncpy( tempSmiles, tempRow, strlen( tempRow ) - 1 );
00392                         tempSmiles[ strlen( tempRow ) - 1 ] = '\0';
00393                         strncpy( tempName, tempRow, strlen( tempRow ) - 1 );
00394                         tempName[ strlen( tempRow ) - 1 ] = '\0';
00395                 }
00396                 /* the input file contains two columns - the SMILES string and the name */
00397                 else
00398                 {
00399                         substring = strstr( tempRow, "\t" );
00400                         strncpy( tempSmiles, tempRow, ( substring - tempRow ) );
00401                         tempSmiles[ substring - tempRow ] = '\0';
00402                         strncpy( tempName, substring + 1, ( tempRow + strlen( tempRow ) - 1 ) - ( substring + 1 ) );
00403                         tempName[ ( tempRow + strlen( tempRow ) - 1 ) - ( substring + 1 ) ] = '\0';
00404                 }
00405                 #if COUNTSMARTS_DEBUG
00406                 fprintf( stdout, "Smiles: %s\tName: %s\n", tempSmiles, tempName ); fflush( NULL );
00407                 #endif
00408                 /* a SmilesCompound is created and inserted at the tail of the list
00409                 if the insertion fails due to the presence of an identical SmilesCompound it is destroyed */
00410                 scPtr = SmilesCompound_create( tempName, tempSmiles, daylight_type, errorLogFile );
00411                 /* checking if the creation of the SmilesCompound was successful */
00412                 if( scPtr != BOOLEAN_FALSE )
00413                 {
00414                         if( List_insertTail( listPtr, scPtr ) == BOOLEAN_FALSE )
00415                                 SmilesCompound_destroy( scPtr );
00416                 }
00417                 else
00418                 {
00419                         if( !( errorMessage = calloc( 300, sizeof( char ) ) ) )
00420                                 MemoryError( "errorMessage", "readDataFromStream" );
00421                         sprintf( errorMessage, "Error reading compound %s\n", tempName );
00422                         StringArray_addElement( sAPtr, errorMessage );
00423                 }
00424         }
00425         /* if there are any error messages they are printed to stderr */
00426         if( StringArray_getNumberOfElements( sAPtr ) != 0 )
00427                 StringArray_display( sAPtr, stderr );
00428         
00429         /* cleaning up */
00430         free( tempRow );
00431         free( tempSmiles );
00432         free( tempName );
00433         StringArray_destroy( sAPtr );
00434         
00435         return listPtr;
00436 }
00437 
00438 /** Examines the properties of a file stream.
00439 * 
00440 * The number of columns, characters per column and the number of rows are counted and the result
00441 * of this examination are stored in a structure @ref DoubleArray. This @ref DoubleArray contains
00442 * three elements regarded to the properties of dataFile:
00443 *
00444 * @li index 0 - number of columns
00445 * @li index 1 - maximum number of characters per column
00446 * @li index 2 - number of rows
00447 *
00448 * The file is also checked for integrity, i.e. it is checked, if it has the same number of columns in
00449 * all rows. If this is not the case, the define @ref FileIntegrityError is called and the program
00450 * aborts.
00451 *
00452 * @param inputStream file pointer of the stream to examine
00453 * @param nameOfStream string indicating the name of the stream
00454 * @retval DoubleArrayPtr contains the properties of @c inputStream
00455 * @author Uli Fechner
00456 * @version 28/11/2003 - Uli Fechner - initial release
00457 */
00458 DoubleArrayPtr getFileProperties( FILE* inputStream, const char* const nameOfStream )
00459 {
00460         int numberOfColumns = 1; /* number of columns in dataFile */
00461         int maxCharsPerRow = 0; /* maximum number of characters per row in dataFile */
00462         int numberOfRows = 0; /* number of rows in dataFile */
00463         char currentCharacter; /* temporary storage of the character read by fgetc */
00464         int position; /* counter for the position in tempRow */
00465         char* tempRow; /* temporary string containing one row for analysis */
00466         char* substring; /* pointer needed for the 'number of occurences' algorithm */
00467         int counter = 1; /* counter needed by the 'number of occurences' algorithm */
00468         /* dAPtr points on a DoubleArray returning the results.
00469         The DoubleArray contains three values regarding to the properties of dataFile:
00470         index 0 - number of columns
00471         index 1 - maximum number of characters per column
00472         index 2 - number of rows */
00473         DoubleArrayPtr dAPtr; /* the file properties are stored in a DoubleArray structure */
00474         
00475         if( !( tempRow = calloc( 10000, sizeof( char ) ) ) )
00476                 MemoryError( "tempRow", "getFileProperties" );  
00477 
00478         dAPtr = DoubleArray_create( 3 ); /* create a new DoubleArray with 3 elements */
00479         
00480         /* looping over all rows of the file */
00481         while( ( currentCharacter = fgetc( inputStream ) ) != EOF )
00482         {
00483                 /* putting the character back in the stream */
00484                 ungetc( currentCharacter, inputStream );
00485                 
00486                 /* resetting the position counter and the tab counter */
00487                 position = 0;
00488                 counter = 1;
00489                 
00490                 /* getting a line of inputStream */
00491                 while( ( currentCharacter = fgetc( inputStream ) ) != '\n' )
00492                 {
00493                         tempRow[ position ] = currentCharacter;
00494                         position++;
00495                 }
00496                 tempRow[ position ] = '\0';
00497                 numberOfRows++;
00498                 
00499                 /* counting the columns by counting the number of occurences of '\t' */
00500                 substring = strstr( tempRow, "\t" );
00501                 while( substring != NULL)
00502                 {
00503                         counter++;                      
00504                         substring = strstr( substring + 1, "\t" );
00505                 }
00506                 
00507                 /* if the number of columns of the first row are counted the variable 'numberOfColumns' is set */
00508                 if( numberOfRows == 1 )
00509                         numberOfColumns = counter;
00510                 
00511                 /* if the number of columns in one row is different from that in the first row the program aborts */
00512                 if( counter != numberOfColumns )                        
00513                         FileIntegrityError( nameOfStream, numberOfRows, counter, numberOfColumns );
00514                         
00515                 /* keeping the maximum number of chars in one row up to date */
00516                 if( strlen( tempRow ) > maxCharsPerRow )
00517                         maxCharsPerRow = position;
00518         }
00519 
00520         /* checking if there are exactly 2 columns; if not the input file is invalid and the program aborts */
00521         if( numberOfColumns > 2 )
00522         {
00523                 fprintf( stderr, "\nERROR: The number of columns in %s is neither one nor two!\n", nameOfStream );
00524                 fprintf( stderr, "Only input files with the name in the first column and the SMILES string\n" );
00525                 fprintf( stderr, "in the second column or the SMILES string as the only column are valid!\n" );
00526                 AbortProgram;
00527         }
00528         
00529         /* the values regarding to the file property are copied to the struct DoubleArray */
00530         DoubleArray_setValue( dAPtr, 0, numberOfColumns );
00531         DoubleArray_setValue( dAPtr, 1, maxCharsPerRow );
00532         DoubleArray_setValue( dAPtr, 2, numberOfRows );
00533         
00534         free( tempRow );
00535         
00536         return dAPtr;
00537 }
00538 
00539 /** Displays the help text on standard error.
00540 *
00541 * @author UF
00542 * @version 22/04/04 - UF - initial release
00543 */
00544 void displayHelpText( )
00545 {
00546         fprintf( stderr, "\nNAME:\n" );
00547         fprintf( stderr, "  countSmarts\n" );
00548         fprintf( stderr, "\nFUNCTION:\n" );
00549         fprintf( stderr, "  Counts the occurrences of SMARTS in a SMILES.\n" );
00550         fprintf( stderr, "\nUSAGE:\n" );
00551         fprintf( stderr, "  countSmarts [Options] <INFILE\n" );
00552         fprintf( stderr, "     INFILE has to be a file with one SMILES per row or a tab separated file\n" );
00553         fprintf( stderr, "     with SMILES in the first and labels in the second column.\n" );
00554         fprintf( stderr, "\nREMARK:\n" );
00555         fprintf( stderr, "  An error log file is automatically created. Its name is the suffix '.log'\n" );
00556         fprintf( stderr, "  appended to the name of the output file.\n" );
00557         fprintf( stderr, "\nOPTIONS:\n" );
00558         fprintf( stderr, "  -h\n" );
00559         fprintf( stderr, "     Display this help text.\n" );
00560         fprintf( stderr, "  -v\n" );
00561         fprintf( stderr, "     Display detailed version information and exit.\n" );
00562         fprintf( stderr, "  -o FILENAME @ MANDATORY\n" );
00563         fprintf( stderr, "     Set FILENAME as the name of the output file.\n" );
00564         fprintf( stderr, "  -s FILENAME @ MANDATORY\n" );
00565         fprintf( stderr, "     Set FILENAME as the name of the SMARTS file (same format as the INFILE).\n" );
00566         fprintf( stderr, "  -m INTEGER @ [1,3] @ DEFAULT = 2\n" );
00567         fprintf( stderr, "     Specifies the SMARTS match type:\n" );
00568         fprintf( stderr, "     1 = exhaustive search [dt_match,0]\n" );
00569         fprintf( stderr, "     2 = unique set-of-atoms in results [dt_umatch,0])\n" );
00570         fprintf( stderr, "     3 = each atom appears in exactly one result [dt_xmatch,0]\n" );
00571         fprintf( stderr, "  -u\n" );
00572         fprintf( stderr, "     Only unique SMILES are written to the output file.\n" );
00573         fprintf( stderr, "\nAUTHOR:\n" );
00574         fprintf( stderr, "  Uli Fechner\n" );
00575         fprintf( stderr, "\nVERSION & RELEASE DATE:\n" );
00576         fprintf( stderr, "  %s ( %s )\n", COUNTSMARTS_VERSION, COUNTSMARTS_DATE );
00577         fprintf( stderr, "\nBUGS:\n" );
00578         fprintf( stderr, "  Please report bugs to u.fechner@chemie.uni-frankfurt.de\n\n" );
00579         exit( EXIT_SUCCESS );
00580 }
00581 
00582 /** Displays the version information text on standard error.
00583 *
00584 * @author Uli Fechner
00585 * @version 28/11/2003 - Uli Fechner - initial release
00586 */
00587 void displayVersionInformation( void )
00588 {
00589         fprintf( stderr, "\nVERSION & RELEASE DATE:\n\n");
00590         fprintf( stderr, "  includes.h: %s [%s]\n", INCLUDES_VERSION, INCLUDES_DATE );
00591         fprintf( stderr, "  generalDefines.h: %s [%s]\n", GENERALDEFINES_VERSION, GENERALDEFINES_DATE );
00592         fprintf( stderr, "  givenClp.c: %s [%s]\n", GIVENCLP_VERSION, GIVENCLP_DATE );
00593         fprintf( stderr, "  doubleArray.c: %s [%s]\n", DOUBLEARRAY_VERSION, DOUBLEARRAY_DATE );
00594         fprintf( stderr, "  clp.c: %s [%s]\n", CLP_VERSION, CLP_DATE );
00595         fprintf( stderr, "  smilesCompound.c: %s [%s]\n", SMILESCOMPOUND_VERSION, SMILESCOMPOUND_DATE );
00596         fprintf( stderr, "  doubleLinkedList.c: %s [%s]\n", DOUBLELINKEDLIST_VERSION, DOUBLELINKEDLIST_DATE );
00597         fprintf( stderr, "  countSmarts core: %s [%s]\n\n", COUNTSMARTS_VERSION, COUNTSMARTS_DATE );
00598         exit( EXIT_SUCCESS );
00599 }

Generated on Mon Nov 8 16:04:06 2004 for countSmarts by doxygen 1.3.6