#!/usr/local/bin/perl

# Takes an SGML file and divides it, putting one document per file.
# Uses the DOCNO field to name each file.
# If a <keepfile> is provided, a document file is produced only for
# those doc ids noted in that file.

#

use strict;
use warnings;

# Command line:  divide [-h] <sgmlfile> [<keepfile>]
#
#   -h          heads only: print the head lines to STDOUT but do not
#               write any per-document files.
#   <sgmlfile>  input file; each document starts with a line beginning
#               "<DOC>" and ends with a line beginning "</DOC>".
#   <keepfile>  optional tab-separated list; a document is written only
#               if its DOCNO appears in the first column with a
#               non-empty second column.  Without a keep-file every
#               document that has a DOCNO is written.

my $heads_only = 0;
if (@ARGV && $ARGV[0] eq "-h") { shift @ARGV; $heads_only = 1; }

# Checked after consuming -h so that "divide -h" alone also gets the usage.
if (!@ARGV) { die("usage: divide [-h] <sgmlfile> [<keepfile>]\n"); }

my ($inputfile, $keepfile) = @ARGV;

# find the raw text input file
open(my $in, '<', $inputfile) or die("Can't open $inputfile!\n");
my $count = 0;

# read the list of files to preserve
my %keep;
my $have_keeplist = defined $keepfile;
if ($have_keeplist) {
    open(my $list, '<', $keepfile) or die("Can't open $keepfile!\n");
    while (my $entry = <$list>) {
        # Lexicals here: the original used $a/$b, which are sort()'s globals.
        my ($docid, $label) = split(/\t/, $entry);
        next unless defined $docid;
        $keep{$docid} = $label;
    }
    close($list);
}

print STDERR "Processing $inputfile...\n";

while (my $line = <$in>) {
    next unless $line =~ /^<DOC>/;

    # Start of a document: collect its lines until the closing tag.
    $count++;
    my @doc        = ($line);
    my $id         = "";
    my $print_head = 0;

    while (defined($line = <$in>)) {
        push @doc, $line;

        # Take the first token after the tag, whatever the spacing is and
        # whether or not the closing tag is glued to the id.
        if ($line =~ /^<DOCNO>\s*([^\s<]+)/) { $id = $1; }

        # print out the heads, as a categorization aid
        if ($line =~ /^<1ST_LINE>/) { print "\n", $id, ": ", $line; }
        if ($line =~ /^<HEAD>/)     { $print_head = 1; }
        # The comma after $id matters: without it Perl treats $id as the
        # filehandle to print to and the head line is lost.
        if ($print_head)            { print $id, ": ", $line; }
        if ($line =~ m{</HEAD>})    { $print_head = 0; }

        last if $line =~ m{^</DOC>};
    }

    next if $heads_only;

    if ($id eq "") {
        print STDERR "Warning! did not find a DOCNO in document $count.\n";
        next;
    }

    # With a keep-file, only listed documents are written.
    if ($have_keeplist) {
        next unless defined $keep{$id} && $keep{$id} ne "";
    }

    # Report a failed write instead of silently claiming success.
    my $out;
    if (!open($out, '>', $id)) {
        print STDERR "Warning! can't write $id: $!\n";
        next;
    }
    print $out @doc;
    if (!close($out)) {
        print STDERR "Warning! error closing $id: $!\n";
        next;
    }
    print STDERR "found $id\n";
}

close($in);

#


  
    