/* --------------------------------------------------------------------------
 *
 * tod.c - Daylight TDT Orderer
 *
 * This is the mainline routine of a "filter" program that reads a TDT and
 * puts the data into the order specified in the datatypes file.
 * The datatypes order determines the TDT tree, NOT the input file.
 * In the case of repeating identifier tags in the input file, the data
 * between the tags is (optionally) subsorted in datatypes-file order.
 * This includes all data except those with primary data tags.
 * Invalid characters in data tags, those other than A-Z, a-z, 0-9 and $,
 * are automatically removed from the incoming data tags.
 * Data fields with datatypes not in the datatypes file are eliminated but
 * reported.  Datatype definitions not found in the input file are reported.
 * Input data fields of any length are handled, with any number of data tags.
 * Data tag translation may also be accomplished: if an alternative data tag
 * follows the data tag on the "$D" line, then that alternative tag in the
 * input is mapped to the correct tag.
 * Standard input is read and output is written to standard output.
 *
 * Usage: tod -fn -lm -v -idatatypes < input.tdt > output.tdt 2> errs
 *
 *   -f   first record to process
 *   -l   last record to process
 *   -v   verbose
 *   -i   name of datatypes file (required) with optional mappings
 *   -t   test - no TDTs are written
 *   -n   do not sort output records
 *   -u   sort, but do not subsort output records
 *
 * --------------------------------------------------------------------------*/

#include <stdio.h>
#if MEM_DEBUG
/* NOTE(review): the header name was lost in the source; the dbgMem* and
 * MemPoolCheck calls below look like the SmartHeap debug API - confirm. */
#include <smrtheap.h>
#endif
#include <stdlib.h>
#include <string.h>

#define USEMSG "Usage: %s -idatatypes [-v] [-t] [-n] [-u] [-f first-record] [-l last-record]\n"

#define TRUE 1
#define FALSE 0

#define LINE_START_SIZE 255        /* initial size of a line buffer        */
#define LINE_INC_SIZE   80         /* growth step when a line does not fit */
#define TAG_DELIMS      " ,;:\t\n" /* separators after "<tag>" on $D lines */

/* One line of a TDT record. */
typedef struct tdt_datatype {
    char *data_tag;    /* tag text preceding '<'; "" when the line is dropped */
    char *data_value;  /* "<value>" text, stored without the newline          */
    int rank;          /* 1-based output position, 0 = not ranked             */
    int inv;           /* inv of entry k = index of the line output (k+1)th   */
} TDT_DATATYPE, *TDT_DATATYPE_PTR;

/* One "$D" definition from the datatypes file. */
typedef struct tdt_datatag {
    char *old_tag;     /* alternative tag accepted on input (or == new_tag) */
    char *new_tag;     /* canonical tag written on output                   */
} TDT_DATATAG, *TDT_DATATAG_PTR;

/* Declared explicitly so that no feature-test macro is needed to see them. */
extern int optind;                 /* argv index of next argument */
extern char *optarg;               /* -> option argument          */
extern int getopt(int argc, char *const argv[], const char *optstring);

static unsigned long first, last;
static int verbose;
static int dump;
static int sort;
static int subsort;
static FILE *infile = NULL;
static FILE *errfile = NULL;
static FILE *tagfile = NULL;

static void getargs(int argc, char *argv[]);
static TDT_DATATAG *proc_tags(int *num_lines);
static void proc_tdts(TDT_DATATAG *tag_data, int tag_lines);
static char **sortTDT(TDT_DATATYPE *tdt_data, int tdt_lines, TDT_DATATAG *tag_data, int tag_lines, int *num_defs);
static int subsortTDT(TDT_DATATYPE *tdt_data, TDT_DATATAG *tag_data, int tag_lines, int start, int end, int *rrank);
static char *fixTag(char *buff);

/************************************************************************/
/*                                                                      */
/* Small helpers: checked allocation, line reading, string lists.      */
/*                                                                      */
/************************************************************************/

/* malloc() that reports to errfile and exits instead of returning NULL. */
static void *xmalloc(size_t size)
{
    void *mem = malloc(size ? size : 1);

    if (mem == NULL) {
        fprintf(errfile, "Error - out of memory\n");
        exit(1);
    }
    return mem;
}

/* realloc() that reports to errfile and exits instead of returning NULL. */
static void *xrealloc(void *ptr, size_t size)
{
    void *mem = realloc(ptr, size ? size : 1);

    if (mem == NULL) {
        fprintf(errfile, "Error - out of memory\n");
        exit(1);
    }
    return mem;
}

/* Return a heap copy of s; the caller frees it. */
static char *dup_string(const char *s)
{
    char *copy = xmalloc(strlen(s) + 1);

    strcpy(copy, s);
    return copy;
}

/*
 * Read one whole line from fp into *buf, growing the buffer (and *size) in
 * LINE_INC_SIZE steps until the newline has been read.  The newline, when
 * present, is kept.  A final line that ends at EOF without a newline is
 * returned as is.  Returns *buf, or NULL when nothing more can be read.
 */
static char *read_line(FILE *fp, char **buf, size_t *size)
{
    size_t len;

    if (fgets(*buf, (int)*size, fp) == NULL)
        return NULL;

    len = strlen(*buf);
    while (len + 1 >= *size && (*buf)[len - 1] != '\n') {
        *size += LINE_INC_SIZE;
        *buf = xrealloc(*buf, *size);
        /* Continue reading over the terminating NUL of the partial line */
        if (fgets(*buf + len, LINE_INC_SIZE + 1, fp) == NULL)
            break;
        len += strlen(*buf + len);
    }
    return *buf;
}

/* Append a copy of s to the string list unless an equal string is present. */
static void add_unique(char ***list, int *count, const char *s)
{
    int i;

    for (i = 0; i < *count; i++) {
        if (strcmp(s, (*list)[i]) == 0)
            return;
    }
    *list = xrealloc(*list, ((size_t)*count + 1) * sizeof **list);
    (*list)[*count] = dup_string(s);
    (*count)++;
}

/* Print each string of the list on its own line to errfile, then free it. */
static void report_and_free(char **list, int count)
{
    int i;

    for (i = 0; i < count; i++) {
        fprintf(errfile, "%s\n", list[i]);
        free(list[i]);
    }
    free(list);
}

/* True for the characters permitted in a data tag: A-Z, a-z, 0-9 and $. */
static int is_tag_char(char c)
{
    return (c >= '0' && c <= '9') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= 'a' && c <= 'z') ||
           (c == '$');
}

/*
 * Parse a record number given to -f or -l.  Only a plain decimal number is
 * accepted; returns FALSE (leaving *out untouched) for anything else.
 */
static int parse_record_number(const char *text, unsigned long *out)
{
    char *end;
    unsigned long value;

    if (text[0] < '0' || text[0] > '9')
        return FALSE;
    value = strtoul(text, &end, 10);
    if (*end != '\0')
        return FALSE;
    *out = value;
    return TRUE;
}

/************************************************************************/
/*                                                                      */
/* Mainline processing.                                                 */
/*                                                                      */
/************************************************************************/
int main(int argc, char *argv[])
{
    int i;
    TDT_DATATAG *tag_data;
    int tag_lines;

    infile = stdin;
    errfile = stderr;

#if MEM_DEBUG
    dbgMemSetDefaultErrorOutput(DBGMEM_OUTPUT_CONSOLE,"");
#endif

    getargs(argc, argv);

    tag_data = proc_tags(&tag_lines);
    proc_tdts(tag_data, tag_lines);

    /* Free the data tag structures */
    for (i = 0; i < tag_lines; i++) {
        free(tag_data[i].old_tag);
        free(tag_data[i].new_tag);
    }
    free(tag_data);

    fclose(infile);
    fclose(errfile);

#if MEM_DEBUG
    MemPoolCheck(MemDefaultPool);
    dbgMemReportLeakage(NULL,1,1);
#endif

    /* All done */
    return 0;
}

/************************************************************************/
/*                                                                      */
/* Process all TDT data tag records                                     */
/*                                                                      */
/* Reads the datatypes file (tagfile) and builds one entry for every    */
/* line that starts with "$D".  The canonical tag is the text between   */
/* '<' and '>'; the first token after '>' (separated by blanks, tabs,   */
/* ',', ';' or ':') is the alternative tag.  When there is none the     */
/* canonical tag is used for both.  The tag file is closed on return.   */
/*                                                                      */
/* num_lines  receives the number of entries returned.                  */
/* Returns the entry array; the caller frees the tags and the array.    */
/*                                                                      */
/************************************************************************/
static TDT_DATATAG *proc_tags(int *num_lines)
{
    size_t line_size = LINE_START_SIZE;
    char *lineBuf = xmalloc(line_size);
    TDT_DATATAG *tag_data = xmalloc(sizeof *tag_data);
    int tag_lines = 0;
    int i;

    /* Read in each line from the tag file */
    while (read_line(tagfile, &lineBuf, &line_size) != NULL) {
        char *open, *close;
        char *oldTag, *newTag;

        /* If this is not the data tag name, skip it */
        if (strncmp(lineBuf, "$D", 2) != 0)
            continue;

        /* A definition without "<tag>" cannot be used; report and skip it */
        open = strchr(lineBuf, '<');
        close = open ? strchr(open + 1, '>') : NULL;
        if (close == NULL || close == open + 1) {
            fprintf(errfile, "Malformed datatype definition %s - skipping \n", lineBuf);
            continue;
        }
        *close = '\0';
        newTag = open + 1;

        /* The alternative tag is the first token after '>' */
        oldTag = close + 1;
        oldTag += strspn(oldTag, TAG_DELIMS);
        oldTag[strcspn(oldTag, TAG_DELIMS)] = '\0';

        /* If no alternative data tag in the file, use the canonical one */
        if (*oldTag == '\0')
            oldTag = newTag;

        /* Grow the tag array by one entry (the first entry already exists) */
        if (tag_lines)
            tag_data = xrealloc(tag_data, ((size_t)tag_lines + 1) * sizeof *tag_data);

        tag_data[tag_lines].old_tag = dup_string(oldTag);
        tag_data[tag_lines].new_tag = dup_string(newTag);
        tag_lines++;
    }

    /* Dump it out */
    if (verbose) {
        fprintf(errfile,"Data Tags:\n");
        for (i = 0; i < tag_lines; i++) {
            fprintf(errfile,"Old tag: %s New tag: %s\n",tag_data[i].old_tag,tag_data[i].new_tag);
        }
    }

    free(lineBuf);
    fclose(tagfile);

    *num_lines = tag_lines;
    return tag_data;
}

/*
 * Prepare the lines of one record for sorting: reset the rank fields, strip
 * bad characters from each tag, and map an alternative tag to its canonical
 * tag.  A line whose tag has no definition is blanked (so it is neither
 * sorted nor written) and its tag is remembered once in the unfound list.
 */
static void resolve_tags(TDT_DATATYPE *tdt_data, int tdt_lines,
                         TDT_DATATAG *tag_data, int tag_lines,
                         char ***unfound_tags, int *num_unfound_tags)
{
    int i, j;

    for (i = 0; i < tdt_lines; i++) {
        TDT_DATATYPE *line = &tdt_data[i];
        int found = FALSE;

        /* Initialize ranks for TDT records */
        line->rank = 0;
        line->inv = -1;

        /* Remove any bad characters from the input tag */
        line->data_tag = fixTag(line->data_tag);

        for (j = 0; j < tag_lines; j++) {
            const char *new_tag = tag_data[j].new_tag;

            /* Found old tag, replace it with new tag */
            if (strcmp(line->data_tag, tag_data[j].old_tag) == 0) {
                /* If new tag is larger than old tag, allocate more space */
                if (strlen(new_tag) > strlen(line->data_tag))
                    line->data_tag = xrealloc(line->data_tag, strlen(new_tag) + 1);
                strcpy(line->data_tag, new_tag);
                found = TRUE;
                break;
            }
            /* Found new tag, so we are done */
            if (strcmp(line->data_tag, new_tag) == 0) {
                found = TRUE;
                break;
            }
        }

        /* Data tag was not found, null the TDT line */
        if (!found) {
            if (verbose)
                fprintf(errfile,"Data tag %s not found in type definition\n",line->data_tag);
            /* Save the not found tag for display in the summary */
            add_unique(unfound_tags, num_unfound_tags, line->data_tag);
            line->data_tag[0] = '\0';
            line->data_value[0] = '\0';
        }
    }
}

/*
 * Write one record to standard output, in ranked order when sorting is on
 * and in input order otherwise.  Blanked lines are omitted, and the "|"
 * terminator is written only if at least one line was written.
 */
static void write_record(const TDT_DATATYPE *tdt_data, int tdt_lines, unsigned long count)
{
    int wrote = FALSE;
    int i, j;

    if (verbose)
        printf("RECORD %lu\n", count);

    for (i = 0; i < tdt_lines; i++) {
        j = sort ? tdt_data[i].inv : i;
        if (j != -1 && tdt_data[j].data_tag[0] != '\0') {
            printf("%s%s\n",tdt_data[j].data_tag,tdt_data[j].data_value);
            wrote = TRUE;
        }
    }
    if (wrote)
        printf("|\n");
}

/************************************************************************/
/*                                                                      */
/* Process all TDT records                                              */
/*                                                                      */
/* Reads infile line by line.  Lines are collected until a line         */
/* starting with "|" ends the record; a record numbered "first" or      */
/* higher is then tag-translated, sorted and (unless -t) written.       */
/* Reading stops after record "last" when "last" is non-zero.           */
/* Lines without a '<' are reported and skipped.  After the input is    */
/* exhausted, the discarded tags and the definitions missing from one   */
/* or more processed records are listed on errfile.                     */
/*                                                                      */
/************************************************************************/
static void proc_tdts(TDT_DATATAG *tag_data, int tag_lines)
{
    size_t line_size = LINE_START_SIZE;
    char *lineBuf = xmalloc(line_size);
    TDT_DATATYPE *tdt_data = NULL;  /* line slots, reused for every record */
    int max_tdt_lines = 0;          /* number of slots allocated           */
    int tdt_lines = 0;              /* slots used by the current record    */
    unsigned long count = 1;        /* number of the current record        */
    char **unfound_tags = NULL;
    int num_unfound_tags = 0;
    char **unused_defs = NULL;
    int num_unused_defs = 0;
    int i;

    /* Read in each line from the input file */
    while (read_line(infile, &lineBuf, &line_size) != NULL) {
        char *sep;
        size_t tagLength, valueLength;
        TDT_DATATYPE *line;

        /* If this is the end of the TDT, process it */
        if (lineBuf[0] == '|') {
            /* If this TDT is in range, process it */
            if (count >= first) {
                char **record_defs;
                int num_record_defs = 0;

                resolve_tags(tdt_data, tdt_lines, tag_data, tag_lines,
                             &unfound_tags, &num_unfound_tags);

                /* sortTDT() returns a fresh list for each record; merge it
                 * into the running summary and release it, so that nothing
                 * leaks and the summary covers every processed record. */
                record_defs = sortTDT(tdt_data, tdt_lines, tag_data, tag_lines, &num_record_defs);
                for (i = 0; i < num_record_defs; i++) {
                    add_unique(&unused_defs, &num_unused_defs, record_defs[i]);
                    free(record_defs[i]);
                }
                free(record_defs);

                /* Dump out TDT records in order using the inverted rank */
                if (dump)
                    write_record(tdt_data, tdt_lines, count);
            }

            /* Quit if past the last TDT */
            count++;
            if (last && count > last)
                break;

            /* Get next TDT */
            tdt_lines = 0;
            continue;
        }

        /* Get the data tag */
        sep = strchr(lineBuf, '<');
        if (!sep) {
            fprintf(errfile,"No datatag seperator in line %s - skipping \n",lineBuf);
            continue;
        }
        tagLength = (size_t)(sep - lineBuf);

        /* The value is stored without the newline, if the line has one */
        valueLength = strlen(sep);
        if (sep[valueLength - 1] == '\n')
            valueLength--;

        /* If number of lines in TDT struct is too small, add another one */
        if (tdt_lines >= max_tdt_lines) {
            tdt_data = xrealloc(tdt_data, ((size_t)tdt_lines + 1) * sizeof *tdt_data);
            tdt_data[tdt_lines].data_tag = NULL;
            tdt_data[tdt_lines].data_value = NULL;
            max_tdt_lines++;
        }

        /* Size the slot's strings for this line (a new slot starts at NULL) */
        line = &tdt_data[tdt_lines];
        line->data_tag = xrealloc(line->data_tag, tagLength + 1);
        line->data_value = xrealloc(line->data_value, valueLength + 1);

        /* Copy the data tag from the TDT line */
        memcpy(line->data_tag, lineBuf, tagLength);
        line->data_tag[tagLength] = '\0';

        /* Copy the value of TDT onto the TDT line without the newline */
        memcpy(line->data_value, sep, valueLength);
        line->data_value[valueLength] = '\0';

        tdt_lines++;
    }

    /* Report all tags in input not found in the datatype definition */
    if (num_unfound_tags)
        fprintf(errfile,"Input data discarded:\n(Tags present in input file not in datatypes definition)\n");
    report_and_free(unfound_tags, num_unfound_tags);

    /* Report all datatype definitions not found in the input file */
    if (num_unused_defs)
        fprintf(errfile,"\nInput data types not found in some or all TDT records:\n(Datatypes definitions not found in input file)\n");
    report_and_free(unused_defs, num_unused_defs);

    /* Free all tdt data structures */
    for (i = 0; i < max_tdt_lines; i++) {
        free(tdt_data[i].data_tag);
        free(tdt_data[i].data_value);
    }
    free(tdt_data);
    free(lineBuf);
}

/*********************************************************************/
/*                                                                   */
/* Strip bad characters from datatype tags.                          */
/*                                                                   */
/* Every character other than A-Z, a-z, 0-9 and $ is removed from    */
/* buff in place.  In verbose mode each removed character is         */
/* reported together with the unmodified tag.  Returns buff.         */
/*                                                                   */
/*********************************************************************/
static char *fixTag(char *buff)
{
    char *src, *dst;

    /* Report first, while buff still holds the tag as it was read */
    if (verbose) {
        for (src = buff; *src != '\0'; src++) {
            if (!is_tag_char(*src))
                fprintf(errfile,"Bad character '%c' removed from tag %s\n",*src,buff);
        }
    }

    /* Compact the permitted characters towards the front */
    dst = buff;
    for (src = buff; *src != '\0'; src++) {
        if (is_tag_char(*src))
            *dst++ = *src;
    }
    *dst = '\0';

    return buff;
}

/************************************************************************/
/*                                                                      */
/* Sort the lines in the TDT record                                     */
/*                                                                      */
/* Ranks are handed out in datatypes-file order: for each definition,   */
/* every not yet ranked line carrying its tag gets the next rank, and   */
/* the inv field of entry (rank-1) records that line's index.           */
/* If the subsort flag is true, TDT record data will be subsorted       */
/* within repeating identifier data tag fields, except those data       */
/* with (different) identifier data tags.                               */
/*                                                                      */
/* num_defs  receives the number of strings returned.                   */
/* Returns the tags of the definitions that matched no line of this     */
/* record (NULL when there are none); the caller frees the strings      */
/* and the array.                                                       */
/*                                                                      */
/************************************************************************/
static char **sortTDT(TDT_DATATYPE *tdt_data,int tdt_lines,TDT_DATATAG *tag_data,int tag_lines, int *num_defs)
{
    char **unused_defs = NULL;
    int num_unused_defs = 0;
    int start = 0;
    int end = tdt_lines - 1;
    int rank = 1;
    int i, j;

    /* For each tag, find all matches with tdt lines and assign them a rank */
    for (i = 0; i < tag_lines; i++) {
        int found = FALSE;
        int prev_primary = -1;   /* previous line carrying this '$' tag */

        /* Since tag may occur several times, we can't break out of this loop */
        for (j = start; j <= end; j++) {
            if (tdt_data[j].data_tag[0] == '\0' ||
                strcmp(tag_data[i].new_tag, tdt_data[j].data_tag) != 0)
                continue;

            /* Rank TDT line if not already ranked */
            if (!tdt_data[j].rank) {
                /* If this is the second (or later) occurrence of a primary */
                /* datatag, subsort all data between the two occurrences    */
                if (tdt_data[j].data_tag[0] == '$') {
                    if (prev_primary != -1 && subsort)
                        rank = subsortTDT(tdt_data, tag_data, tag_lines, prev_primary, j, &rank);
                    prev_primary = j;
                }
                tdt_data[j].rank = rank;
                tdt_data[rank - 1].inv = j;
                rank++;
            }

            /* Counts as found even if subsortTDT() ranked the line earlier */
            found = TRUE;

            /* Reduce size of inner loop as tags are assigned: a matched   */
            /* line at either edge of the window need not be checked again */
            if (j == start)
                start++;
            else if (j == end)
                end--;
        }

        /* Data definition not found. Save it for the summary */
        if (!found) {
            if (verbose)
                fprintf(errfile,"Data type def tag %s not found in input TDT\n",tag_data[i].new_tag);
            add_unique(&unused_defs, &num_unused_defs, tag_data[i].new_tag);
        }
    }

    /* Information to return */
    *num_defs = num_unused_defs;
    return unused_defs;
}

/************************************************************************/
/*                                                                      */
/* Subsort the lines in TDT record between two primary tags.            */
/*                                                                      */
/* Subsort TDT lines within repeating primary data tag fields           */
/* identified by "start" and "end" (the lines strictly between those    */
/* two indices).  Do not subsort any data that have identifier ('$')    */
/* data tags; no secondary identifiers are permitted.                   */
/*                                                                      */
/* rrank  holds the next rank to assign on entry and on return.         */
/* Returns the same value that is stored in *rrank.                     */
/*                                                                      */
/************************************************************************/
static int subsortTDT(TDT_DATATYPE *tdt_data, TDT_DATATAG *tag_data, int tag_lines, int start, int end, int *rrank)
{
    int rank = *rrank;
    int i, j;

    /* Exclude the two primary-tag lines themselves */
    start++;
    end--;

    /* For each tag, find all matches with tdt lines and assign them a rank */
    for (i = 0; i < tag_lines; i++) {
        /* Only subsort lines with tags that are not primary identifiers */
        if (tag_data[i].new_tag[0] == '$')
            continue;

        /* Sort all TDT lines between start and end */
        for (j = start; j <= end; j++) {
            if (tdt_data[j].data_tag[0] == '\0' ||
                strcmp(tag_data[i].new_tag, tdt_data[j].data_tag) != 0)
                continue;

            /* Rank TDT line if not already ranked */
            if (!tdt_data[j].rank) {
                tdt_data[j].rank = rank;
                tdt_data[rank - 1].inv = j;
                rank++;
            }

            /* Reduce size of inner loop as tags are assigned */
            if (j == start)
                start++;
            else if (j == end)
                end--;
        }
    }

    *rrank = rank;
    return rank;
}

/************************************************************************/
/*                                                                      */
/* Get initial arguments from the command line.                         */
/*                                                                      */
/* Sets first, last, verbose, dump, sort and subsort, and opens the     */
/* datatypes file as tagfile.  Exits with status 1 if that file cannot  */
/* be opened, 2 on a usage error and 3 if no -i option was given.       */
/*                                                                      */
/************************************************************************/
static void getargs(int argc, char *argv[])
{
    int c;              /* option flag character         */
    int errflag = 0;    /* error flag found in arg list? */

    first = 1;
    last = 0;
    verbose = FALSE;
    dump = TRUE;
    sort = TRUE;
    subsort = TRUE;

    /* process command line arguments */
    while ((c = getopt(argc, argv, "i:f:l:vtnu")) != -1) {
        switch (c) {
        case 'i':   /* datatypes file; optarg is used directly, any length */
            if (tagfile)
                fclose(tagfile);    /* repeated -i: the last one wins */
            if ((tagfile = fopen(optarg, "r")) == NULL) {
                fprintf(errfile,"Could not open input tag file %s\n",optarg);
                exit(1);
            }
            break;
        case 'v':
            verbose = TRUE;
            break;
        case 'n':
            sort = FALSE;
            break;
        case 'u':
            subsort = FALSE;
            break;
        case 't':
            dump = FALSE;
            break;
        case 'f':   /* record number to start with */
            if (!parse_record_number(optarg, &first))
                errflag++;
            break;
        case 'l':   /* record number to end with */
            if (!parse_record_number(optarg, &last))
                errflag++;
            break;
        case '?':   /* unknown option flag */
        default:
            errflag++;
            break;
        }
    }

    /* Check for command line errors */
    if (errflag || (argc > optind)) {
        fprintf(errfile,USEMSG, argv[0] );
        exit(2);
    }

    if (!tagfile) {
        fprintf(errfile,"No datatype tag file specified\n");
        fprintf(errfile,USEMSG, argv[0] );
        exit(3);
    }
}