#!/usr/local/bin/perl

# Take an SGML file and divide it, putting one document per file.
# Use the DOCNO field to name each file.

#

use strict;
use warnings;

# Usage: divide [-h] <sgmlfile>
#
# Reads <sgmlfile> and, for every <DOC> ... </DOC> section:
#   * prints the document's DOCNO on STDOUT, one per line (the original
#     author notes this is suitable for use as a .ctl file);
#   * echoes the <1ST_LINE> line and every line from <HEAD> through </HEAD>
#     on STDERR, prefixed with the DOCNO, as a categorization aid;
#   * unless -h (heads only) was given, writes the document's lines to a
#     file named after its DOCNO in the current directory.
# A document without a usable DOCNO is reported on STDERR and not written.

my $heads_only = 0;

# Only consume the first argument when it really is the -h switch; an
# unconditional shift would silently discard a filename.
if (@ARGV && $ARGV[0] eq '-h') {
    $heads_only = 1;
    shift @ARGV;
}
if (!@ARGV) {
    die("usage: divide [-h] <sgmlfile>\n");
}

my $sgml_file = $ARGV[0];
print STDERR "Processing $sgml_file...\n";

# Three-argument open so characters in the filename are never taken as a mode.
open(my $in, '<', $sgml_file) or die("Can't open $sgml_file: $!\n");

my $count = 0;

LINE:
while (my $line = <$in>) {
    # Anything outside a <DOC> section is ignored.
    next LINE unless $line =~ /^<DOC>/;

    $count++;
    my $id         = "";
    my $print_head = 0;
    my @doc        = ($line);    # fresh buffer per document

    while (defined(my $doc_line = <$in>)) {
        push @doc, $doc_line;

        # Print out the file names.  The ID is the first run of characters
        # after the opening tag that is neither whitespace nor '<', so both
        # "<DOCNO> X </DOCNO>" and "<DOCNO>X</DOCNO>" yield "X".
        if ($doc_line =~ /^<DOCNO>\s*([^\s<]+)/) {
            $id = $1;
            print $id, "\n";
        }

        # Print out the heads, as a categorization aid.
        if ($doc_line =~ /^<1ST_LINE>/) {
            print STDERR "\n", $id, ": ", $doc_line;
        }
        if ($doc_line =~ /^<HEAD>/) {
            $print_head = 1;
        }
        if ($print_head) {
            print STDERR $id, ": ", $doc_line;
        }
        # Checked after printing so the line holding </HEAD> is still echoed.
        if ($doc_line =~ /<\/HEAD>/) {
            $print_head = 0;
        }

        last if $doc_line =~ /^<\/DOC>/;
    }

    next LINE if $heads_only;

    if ($id eq "") {
        print STDERR "Warning! did not find a DOCNO in document $count.\n";
        next LINE;
    }

    # NOTE(review): $id comes straight from the input and is used as a path;
    # an ID containing '/' would write outside the current directory.
    my $out;
    if (!open($out, '>', $id)) {
        # Report and carry on, so one unwritable name does not abort the run.
        print STDERR "Warning! can't write $id (document $count): $!\n";
        next LINE;
    }
    print {$out} @doc;
    # Buffered write errors (e.g. disk full) only surface at close.
    close($out)
        or print STDERR "Warning! error closing $id (document $count): $!\n";
}

close($in);

#


  
    