#!/usr/bin/ruby

=begin

usage: ./align.rb <bitext> <L0-vocab> <L1-vocab> <dict>

  The script expects the input data on stdin,
  in the following format:

    <tokens in L0>
    <tokens in L1>
    [<pair> ..]
    [repeat]

  The script writes the transformed sentence pairs
  to <bitext>, the vocabularies to <L*-vocab>, and
  the dictionary to <dict>.

=end

require 'set'

# Validate the command line: exactly four arguments are required
# (<bitext>, <L0-vocab>, <L1-vocab>, <dict>). Anything else prints the
# usage line to stderr and terminates with exit status 1.
unless ARGV.size == 4
  $stderr.puts("usage: ./align.rb <bitext> <L0-vocab> <L1-vocab> <dict>")
  exit(1)
end

# Validates one line of alignment tokens.
#
# +aligns+ is the flat token list of an alignment line: consecutive tokens
# form (a0, a1) pairs, where a0 is the first and a1 the second token of each
# pair. Tokens are compared as given (strings); they are not parsed or
# range-checked here.
#
# Returns nil when the list is valid, otherwise the error message to report:
#   * "error: uneven alignment pair" - odd number of tokens
#   * "error: non 1-1 mapping"       - some a0 or some a1 occurs twice
#
# The argument is never modified, so the caller can still use it afterwards
# (the previous in-place shift loop emptied the caller's array, because
# plain assignment does not copy an Array).
def alignment_error(aligns)
  return "error: uneven alignment pair" if aligns.size.odd?

  seen0 = Set.new
  seen1 = Set.new
  aligns.each_slice(2) do |a0, a1|
    # Set#add? returns nil when the element is already present, so a repeat
    # on either side fails the 1-1 requirement.
    unless seen0.add?(a0) && seen1.add?(a1)
      return "error: non 1-1 mapping"
    end
  end
  nil
end

# work on standard input: consume it three lines at a time
# (<tokens in L0>, <tokens in L1>, <alignment pairs>) until EOF
until $stdin.eof?
  # read in triplet; gets returns nil once the input is exhausted, which
  # means the last triplet is incomplete
  lines = Array.new(3) { $stdin.gets }
  if lines.include?(nil)
    $stderr.puts "error: expecting triplets"
    exit 1
  end
  tokens0, tokens1, aligns = lines.map { |line| line.split }

  # check that the pairs are complete and form a 1-1 mapping
  problem = alignment_error(aligns)
  if problem
    $stderr.puts problem
    exit 1
  end

  # TODO: output bitext (not implemented yet); tokens0, tokens1 and aligns
  # are all still intact at this point.
end