00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00039 #define BINARY 1 /* input_type: id n-grams read with rr_fread (selected by -binary, and the default) */
00040 #define ASCII 2 /* input_type: id n-grams read as text numbers with fscanf (selected by -ascii) */
00041 #define WORDS 3 /* input_type: n-grams read as whitespace-separated words (selected by -words) */
00042
00043 #define NUMERIC 1 /* storage_type: n-gram held as unsigned short ids (BINARY and ASCII input) */
00044 #define ALPHA 2 /* storage_type: n-gram held as an array of strings (WORDS input) */
00045
00046 #include <stdio.h>
00047 #include <stdlib.h>
00048 #include <string.h>
00049 #include "toolkit.h"
00050 #include "ngram.h"
00051 #include "pc_libs/pc_general.h"
00052 #include "rr_libs/general.h"
00053
00054
00055
00056
00057
00058 void main(int argc, char *argv[]) {
00059
00060 int verbosity;
00061 int n;
00062 int m;
00063 int i;
00064 int input_type;
00065 int storage_type;
00066 unsigned short *current_ngram_int;
00067 unsigned short *previous_ngram_int;
00068 char **current_ngram_text;
00069 char **previous_ngram_text;
00070 int current_count;
00071 int running_total;
00072 flag same;
00073 flag first_one;
00074 flag got_to_eof;
00075
00076 running_total = 0;
00077
00078 report_version(&argc,argv);
00079
00080 if (pc_flagarg( &argc, argv,"-help") || argc==1) {
00081 fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n");
00082 fprintf(stderr,"Usage : ngram2mgram -n N -m M\n");
00083 fprintf(stderr," [ -binary | -ascii | -words ]\n");
00084 fprintf(stderr," < .ngram > .mgram\n");
00085 exit(1);
00086 }
00087
00088 n = pc_intarg( &argc, argv,"-n",0);
00089 m = pc_intarg( &argc, argv,"-m",0);
00090 verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00091
00092
00093 input_type = 0;
00094
00095 if (pc_flagarg( &argc, argv,"-binary")) {
00096 input_type = BINARY;
00097 }
00098
00099 if (pc_flagarg( &argc, argv,"-ascii")) {
00100 if (input_type != 0) {
00101 quit(-1,"Error : more than one file format specified.\n");
00102 }
00103 input_type = ASCII;
00104 }
00105
00106 if (pc_flagarg( &argc, argv,"-words")) {
00107 if (input_type != 0) {
00108 quit(-1,"Error : more than one file format specified.\n");
00109 }
00110 input_type = WORDS;
00111 }
00112
00113 if (input_type == 0) {
00114 pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n");
00115 input_type = BINARY;
00116 }
00117
00118 if (n == 0) {
00119 quit(-1,"Must specify a value for n. Use the -n switch.\n");
00120 }
00121
00122 if (m == 0) {
00123 quit(-1,"Must specify a value for m. Use the -m switch.\n");
00124 }
00125
00126 if (n<=m) {
00127 quit(-1,"n must be greater than m.\n");
00128 }
00129
00130 pc_report_unk_args(&argc,argv,verbosity);
00131
00132 if (input_type == BINARY || input_type == ASCII) {
00133 storage_type = NUMERIC;
00134 }
00135 else {
00136 storage_type = ALPHA;
00137 }
00138
00139 if (storage_type == NUMERIC) {
00140 current_ngram_int = (unsigned short *)
00141 rr_malloc(n*sizeof(unsigned short));
00142 previous_ngram_int = (unsigned short *)
00143 rr_malloc(n*sizeof(unsigned short));
00144
00145
00146
00147 current_ngram_text = NULL;
00148 previous_ngram_text = NULL;
00149 }
00150 else {
00151 current_ngram_text = (char **) rr_malloc(n*sizeof(char *));
00152 previous_ngram_text = (char **) rr_malloc(n*sizeof(char *));
00153 for (i=0;i<=n-1;i++) {
00154 current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
00155 previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
00156 }
00157
00158
00159
00160 current_ngram_int = NULL;
00161 previous_ngram_int = NULL;
00162
00163 }
00164
00165 got_to_eof = 0;
00166 first_one = 1;
00167
00168 while (!rr_feof(stdin)) {
00169
00170
00171
00172 if (!first_one) {
00173
00174 if (storage_type == NUMERIC) {
00175 for (i=0;i<=n-1;i++) {
00176 previous_ngram_int[i] = current_ngram_int[i];
00177 }
00178 }
00179 else {
00180 for (i=0;i<=n-1;i++) {
00181 strcpy(previous_ngram_text[i],current_ngram_text[i]);
00182 }
00183 }
00184
00185 }
00186
00187
00188
00189 switch(input_type) {
00190 case BINARY:
00191 for (i=0;i<=n-1;i++) {
00192 rr_fread(¤t_ngram_int[i],sizeof(id__t),1,stdin,
00193 "from id_ngrams at stdin",0);
00194 }
00195 rr_fread(¤t_count,sizeof(count_t),1,stdin,
00196 "from id_ngrams file at stdin",0);
00197 break;
00198 case ASCII:
00199 for (i=0;i<=n-1;i++) {
00200 if (fscanf(stdin,"%hu",¤t_ngram_int[i]) != 1) {
00201 if (!rr_feof(stdin)) {
00202 quit(-1,"Error reading id_ngram.\n");
00203 }
00204 else {
00205 got_to_eof = 1;
00206 }
00207 }
00208 }
00209 if (fscanf(stdin,"%d",¤t_count) != 1) {
00210 if (!rr_feof(stdin)) {
00211 quit(-1,"Error reading id_ngram.\n");
00212 }
00213 else {
00214 got_to_eof = 1;
00215 }
00216 }
00217 break;
00218 case WORDS:
00219 for (i=0;i<=n-1;i++) {
00220 if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) {
00221 if (!rr_feof(stdin)) {
00222 quit(-1,"Error reading id_ngram.\n");
00223 }
00224 else {
00225 got_to_eof = 1;
00226 }
00227 }
00228 }
00229 if (fscanf(stdin,"%d",¤t_count) != 1) {
00230 if (!rr_feof(stdin)) {
00231 quit(-1,"Error reading id_ngram.\n");
00232 }
00233 else {
00234 got_to_eof = 1;
00235 }
00236 }
00237 break;
00238 }
00239
00240 if (!got_to_eof) {
00241
00242
00243
00244 if (!first_one) {
00245
00246 switch(storage_type) {
00247 case NUMERIC:
00248 for (i=0;i<=n-1;i++) {
00249 if (current_ngram_int[i]<previous_ngram_int[i]) {
00250 quit(-1,"Error : ngrams not correctly sorted.\n");
00251 }
00252 else {
00253 if (current_ngram_int[i]>previous_ngram_int[i]) {
00254 i=n;
00255 }
00256 }
00257 }
00258 break;
00259 case ALPHA:
00260 for (i=0;i<=n-1;i++) {
00261 if (strcmp(current_ngram_text[i],previous_ngram_text[i])<0) {
00262 quit(-1,"Error : ngrams not correctly sorted.\n");
00263 }
00264 else {
00265 if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) {
00266 i=n;
00267 }
00268 }
00269 }
00270 break;
00271 }
00272 }
00273
00274
00275
00276 if (!first_one) {
00277
00278 switch(storage_type) {
00279 case NUMERIC:
00280 same = 1;
00281 for (i=0;i<=m-1;i++) {
00282 if (current_ngram_int[i] != previous_ngram_int[i]) {
00283 same = 0;
00284 }
00285 }
00286 if (same) {
00287 running_total += current_count;
00288 }
00289 else {
00290 if (input_type == ASCII) {
00291 for (i=0;i<=m-1;i++) {
00292 printf("%d ",previous_ngram_int[i]);
00293 }
00294 printf("%d\n",running_total);
00295 }
00296 else {
00297 for (i=0;i<=m-1;i++) {
00298 rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout,
00299 "to id_ngrams at stdout");
00300 }
00301 rr_fwrite(&running_total,sizeof(count_t),1,stdout,
00302 "to id n-grams at stdout");
00303 }
00304 running_total = current_count;
00305 }
00306 break;
00307 case ALPHA:
00308 same = 1;
00309 for (i=0;i<=m-1;i++) {
00310 if (strcmp(current_ngram_text[i],previous_ngram_text[i])) {
00311 same = 0;
00312 }
00313 }
00314 if (same) {
00315 running_total += current_count;
00316 }
00317 else {
00318 for (i=0;i<=m-1;i++) {
00319 printf("%s ",previous_ngram_text[i]);
00320 }
00321 printf("%d\n",running_total);
00322 running_total = current_count;
00323
00324 }
00325 break;
00326 }
00327
00328 }
00329 else {
00330 running_total = current_count;
00331 }
00332
00333 first_one = 0;
00334
00335 }
00336 }
00337
00338
00339
00340 switch(input_type) {
00341 case BINARY:
00342 break;
00343 case ASCII:
00344 for (i=0;i<=m-1;i++) {
00345 printf("%d ",previous_ngram_int[i]);
00346 }
00347 printf("%d\n",running_total);
00348 break;
00349 case WORDS:
00350 for (i=0;i<=m-1;i++) {
00351 printf("%s ",previous_ngram_text[i]);
00352 }
00353 printf("%d\n",running_total);
00354 break;
00355 }
00356
00357 pc_message(verbosity,0,"ngram2mgram : Done.\n");
00358
00359 exit(0);
00360
00361 }