Main Page   Compound List   File List   Compound Members   File Members  

ngram2mgram.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00039 #define BINARY 1   /* input_type: ids and count are read from stdin with rr_fread (sizeof(id__t) / sizeof(count_t)) */
00040 #define ASCII 2    /* input_type: ids ("%hu") and count ("%d") are parsed from stdin as text with fscanf */
00041 #define WORDS 3    /* input_type: words ("%s") and count ("%d") are parsed from stdin as text with fscanf */
00042 
00043 #define NUMERIC 1  /* storage_type for BINARY and ASCII input: n-grams held as arrays of unsigned short ids */
00044 #define ALPHA 2    /* storage_type for WORDS input: n-grams held as arrays of character strings */
00045 
00046 #include <stdio.h>
00047 #include <stdlib.h>
00048 #include <string.h>
00049 #include "toolkit.h"
00050 #include "ngram.h"
00051 #include "pc_libs/pc_general.h"
00052 #include "rr_libs/general.h"
00053 
00054 /***************************
00055       MAIN FUNCTION
00056  ***************************/
00057 
00058 void main(int argc, char *argv[]) {
00059 
00060   int verbosity;
00061   int n;
00062   int m;
00063   int i;
00064   int input_type;
00065   int storage_type;
00066   unsigned short *current_ngram_int;
00067   unsigned short *previous_ngram_int;
00068   char **current_ngram_text;
00069   char **previous_ngram_text;
00070   int current_count;
00071   int running_total;
00072   flag same;
00073   flag first_one;
00074   flag got_to_eof;
00075    
00076   running_total = 0;
00077 
00078   report_version(&argc,argv);
00079 
00080   if (pc_flagarg( &argc, argv,"-help") || argc==1) {
00081     fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n");
00082     fprintf(stderr,"Usage : ngram2mgram   -n N -m M\n");
00083     fprintf(stderr,"                    [ -binary | -ascii | -words ]\n");
00084     fprintf(stderr,"                    < .ngram > .mgram\n");
00085     exit(1);
00086   }
00087  
00088   n = pc_intarg( &argc, argv,"-n",0);
00089   m = pc_intarg( &argc, argv,"-m",0);
00090   verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00091   
00092 
00093   input_type = 0;
00094   
00095   if (pc_flagarg( &argc, argv,"-binary")) {
00096     input_type = BINARY;
00097   }
00098 
00099   if (pc_flagarg( &argc, argv,"-ascii")) {
00100     if (input_type != 0) {
00101       quit(-1,"Error : more than one file format specified.\n");
00102     }
00103     input_type = ASCII;
00104   }
00105 
00106   if (pc_flagarg( &argc, argv,"-words")) {  
00107     if (input_type != 0) {
00108       quit(-1,"Error : more than one file format specified.\n");
00109     }
00110     input_type = WORDS;
00111   }    
00112 
00113   if (input_type == 0) {
00114     pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n");
00115     input_type = BINARY;
00116   }
00117 
00118   if (n == 0) {
00119     quit(-1,"Must specify a value for n. Use the -n switch.\n");
00120   }
00121 
00122   if (m == 0) {
00123     quit(-1,"Must specify a value for m. Use the -m switch.\n");
00124   }
00125   
00126   if (n<=m) {
00127     quit(-1,"n must be greater than m.\n");
00128   }
00129 
00130   pc_report_unk_args(&argc,argv,verbosity);
00131 
00132   if (input_type == BINARY || input_type == ASCII) {
00133     storage_type = NUMERIC;
00134   }
00135   else {
00136     storage_type = ALPHA;
00137   }
00138 
00139   if (storage_type == NUMERIC) {
00140     current_ngram_int = (unsigned short *) 
00141       rr_malloc(n*sizeof(unsigned short));
00142     previous_ngram_int = (unsigned short *) 
00143       rr_malloc(n*sizeof(unsigned short));
00144 
00145     /* And to prevent compiler warnings ... */
00146 
00147     current_ngram_text = NULL;
00148     previous_ngram_text = NULL;
00149   }
00150   else {
00151     current_ngram_text = (char **) rr_malloc(n*sizeof(char *));
00152     previous_ngram_text = (char **) rr_malloc(n*sizeof(char *));
00153     for (i=0;i<=n-1;i++) {
00154       current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
00155       previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
00156     }
00157 
00158     /* And to prevent compiler warnings ... */
00159 
00160     current_ngram_int = NULL;
00161     previous_ngram_int = NULL;
00162 
00163   }
00164 
00165   got_to_eof = 0;
00166   first_one = 1;
00167 
00168   while (!rr_feof(stdin)) {
00169 
00170     /* Store previous n-gram */
00171 
00172     if (!first_one) {
00173 
00174       if (storage_type == NUMERIC) {
00175         for (i=0;i<=n-1;i++) {
00176           previous_ngram_int[i] = current_ngram_int[i];
00177         }
00178       }
00179       else {
00180         for (i=0;i<=n-1;i++) {
00181           strcpy(previous_ngram_text[i],current_ngram_text[i]);
00182         }
00183       }
00184 
00185     }
00186 
00187     /* Read new n-gram */
00188 
00189     switch(input_type) {
00190     case BINARY:
00191       for (i=0;i<=n-1;i++) {
00192         rr_fread(&current_ngram_int[i],sizeof(id__t),1,stdin,
00193                  "from id_ngrams at stdin",0);
00194       }
00195       rr_fread(&current_count,sizeof(count_t),1,stdin,
00196                "from id_ngrams file at stdin",0);
00197       break;
00198     case ASCII:
00199       for (i=0;i<=n-1;i++) {
00200         if (fscanf(stdin,"%hu",&current_ngram_int[i]) != 1) {
00201           if (!rr_feof(stdin)) {
00202             quit(-1,"Error reading id_ngram.\n");
00203           }
00204           else {
00205             got_to_eof = 1;
00206           }
00207         }
00208       }
00209       if (fscanf(stdin,"%d",&current_count) != 1) {
00210         if (!rr_feof(stdin)) {
00211           quit(-1,"Error reading id_ngram.\n");
00212         }
00213         else {
00214           got_to_eof = 1;
00215         }
00216       }
00217       break;
00218     case WORDS:
00219       for (i=0;i<=n-1;i++) {
00220         if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) {
00221           if (!rr_feof(stdin)) {
00222             quit(-1,"Error reading id_ngram.\n");
00223           }
00224           else {
00225             got_to_eof = 1;
00226           }
00227         }
00228       }
00229       if (fscanf(stdin,"%d",&current_count) != 1) {
00230         if (!rr_feof(stdin)) {
00231           quit(-1,"Error reading id_ngram.\n");
00232         }
00233         else {
00234           got_to_eof = 1;
00235         }
00236       }
00237       break;
00238     }
00239 
00240     if (!got_to_eof) {
00241 
00242       /* Check for correct sorting */
00243 
00244       if (!first_one) {
00245 
00246         switch(storage_type) {
00247         case NUMERIC:
00248           for (i=0;i<=n-1;i++) {
00249             if (current_ngram_int[i]<previous_ngram_int[i]) {
00250               quit(-1,"Error : ngrams not correctly sorted.\n");
00251             }
00252             else {
00253               if (current_ngram_int[i]>previous_ngram_int[i]) {
00254                 i=n;
00255               }
00256             }
00257           }
00258           break;
00259         case ALPHA:
00260           for (i=0;i<=n-1;i++) {
00261             if (strcmp(current_ngram_text[i],previous_ngram_text[i])<0) {
00262               quit(-1,"Error : ngrams not correctly sorted.\n");
00263             }
00264             else {
00265               if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) {
00266                 i=n;
00267               }
00268             }
00269           }
00270           break;
00271         }
00272       }
00273 
00274       /* Compare this m-gram with previous m-gram */
00275 
00276       if (!first_one) {
00277 
00278         switch(storage_type) {
00279         case NUMERIC:
00280           same = 1;
00281           for (i=0;i<=m-1;i++) {
00282             if (current_ngram_int[i] != previous_ngram_int[i]) {
00283               same = 0;
00284             }
00285           }
00286           if (same) {
00287             running_total += current_count;
00288           }
00289           else {
00290             if (input_type == ASCII) {
00291               for (i=0;i<=m-1;i++) {
00292                 printf("%d ",previous_ngram_int[i]);
00293               }
00294               printf("%d\n",running_total);
00295             }
00296             else {
00297               for (i=0;i<=m-1;i++) {
00298                 rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout,
00299                           "to id_ngrams at stdout");
00300               }
00301               rr_fwrite(&running_total,sizeof(count_t),1,stdout,
00302                         "to id n-grams at stdout");
00303             }
00304             running_total = current_count;
00305           }
00306           break;
00307         case ALPHA:
00308           same = 1;
00309           for (i=0;i<=m-1;i++) {
00310             if (strcmp(current_ngram_text[i],previous_ngram_text[i])) {
00311               same = 0;
00312             }
00313           }
00314           if (same) {
00315             running_total += current_count;
00316           }
00317           else {
00318             for (i=0;i<=m-1;i++) {
00319               printf("%s ",previous_ngram_text[i]);
00320             }
00321             printf("%d\n",running_total);
00322             running_total = current_count;
00323           
00324           }
00325           break;
00326         }
00327       
00328       }
00329       else {
00330         running_total = current_count;
00331       } 
00332     
00333       first_one = 0;
00334     
00335     }
00336   }
00337 
00338   /* Write out final m-gram */
00339 
00340   switch(input_type) {
00341   case BINARY:
00342     break;
00343   case ASCII:
00344     for (i=0;i<=m-1;i++) {
00345       printf("%d ",previous_ngram_int[i]);
00346     }
00347     printf("%d\n",running_total);
00348     break;
00349   case WORDS:
00350     for (i=0;i<=m-1;i++) {
00351       printf("%s ",previous_ngram_text[i]);
00352     }
00353     printf("%d\n",running_total);
00354     break;
00355   } 
00356 
00357   pc_message(verbosity,0,"ngram2mgram : Done.\n");
00358 
00359   exit(0);
00360 
00361 }         

Generated on Tue Dec 21 13:54:45 2004 by doxygen1.2.18