#!/usr/bin/env python
# Time-stamp: <2006-11-13 15:16:58 poser>
#
# Copyright (C) 2006 Linguistic Data Consortium
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is contained in the
# procedure "License" in this file.
# If it is not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
# or go to the web page:  http://www.gnu.org/licenses/gpl.txt.

# Filter to convert the Indian Government PASCII encoding to UTF-8 Unicode.
# Note that PASCII codepoints 0xD3-0xD7 have no Unicode equivalents.
# These are all abbreviations for pious phrases, not characters of the
# usual sort.
#
# The interpretation of 0xC4 and 0xC5, labelled "hamza above (Kashmiri)" and
# "hamza below (Kashmiri)", is unknown; both are converted to U+FFFD.

# Author: Bill Poser (wjposer@ldc.upenn.edu)
# Version: 1.1

import sys
import codecs

# Program version string, reported by the -v option.
Version = '1.1'

# Number of command-line words, including the program name itself.
argc = len(sys.argv)

# Only the first argument is examined.  The slice is an empty list when no
# argument was supplied, so the comparisons below cannot raise IndexError.
if sys.argv[1:2] == ["-v"]:
    # -v: version and licence information on standard output, exit status 2.
    sys.stdout.writelines((
        "Pascii2Uni %s\n" % (Version),
        "Copyright 2006 Linguistic Data Consortium\n",
        "Released under the terms of the GNU General Public License.\n",
    ))
    sys.exit(2)
elif sys.argv[1:2] == ["-h"]:
    # -h: short usage description on standard error, exit status 2.
    sys.stderr.write("This program is a filter that converts from PASCII, the Indian Government\nStandard for Arabic, Kashmiri, Sindhi, and Urdu, to UTF-8 Unicode.\n")
    sys.exit(2)

# Conversion table indexed by PASCII byte value (0x00-0xFF).  Each entry is
# the Unicode string written to the output for that byte.
#
#   * 0x00-0x7F map to the Unicode character with the same code, so that
#     half of the table is generated rather than spelled out.
#   * 0x80-0xFF are listed explicitly, eight entries per row; the comment at
#     the end of each row gives the byte value of the row's first entry.
#   * U+FFFD (REPLACEMENT CHARACTER) appears at 16 positions, among them
#     0xC4-0xC5 and 0xD3-0xD7.
#   * 0x9E is the only entry that expands to two code points.
PasciiUniMap = list(u"%c" % code for code in range(0x80)) + [
    u"\uFFFD", u"\u0640", u"\u0627", u"\u0622", u"\u0628", u"\u067B", u"\u0680", u"\u067E",  # 0x80
    u"\u06A6", u"\u062A", u"\u0629", u"\u067F", u"\u067D", u"\u0679", u"\u062B", u"\u062C",  # 0x88
    u"\u0684", u"\u0683", u"\u0686", u"\u0687", u"\u062D", u"\u062E", u"\u062F", u"\u068C",  # 0x90
    u"\u0688", u"\u068F", u"\u068D", u"\u0630", u"\u0631", u"\u0691", u"\u0691\u06BE", u"\u0632",  # 0x98
    u"\u0698", u"\u0633", u"\u0634", u"\u0635", u"\u0636", u"\u0637", u"\u0638", u"\u0639",  # 0xA0
    u"\u063A", u"\u0641", u"\u0642", u"\u06A9", u"\u06AA", u"\u06AF", u"\u06B3", u"\u06B1",  # 0xA8
    u"\u0644", u"\u0645", u"\u0646", u"\u06BA", u"\u06BB", u"\u0648", u"\u06C4", u"\u06C1",  # 0xB0
    u"\u06BE", u"\u0621", u"\u06CC", u"\uFFFD", u"\uFFFD", u"\u06D2", u"\u064E", u"\u0650",  # 0xB8
    u"\u064F", u"\u0657", u"\u0654", u"\u0655", u"\uFFFD", u"\uFFFD", u"\u0651", u"\u0653",  # 0xC0
    u"\u0652", u"\u0670", u"\u0656", u"\u0671", u"\uFFFD", u"\u0614", u"\u060C", u"\u0610",  # 0xC8
    u"\u0613", u"\u0612", u"\u0611", u"\uFFFD", u"\uFFFD", u"\uFFFD", u"\uFFFD", u"\uFFFD",  # 0xD0
    u"\u0601", u"\u066A", u"\u060D", u"\u066B", u"\u06F0", u"\u06F1", u"\u06F2", u"\u06F3",  # 0xD8
    u"\u06F4", u"\u06F5", u"\u06F6", u"\u06F7", u"\u06F8", u"\u06F9", u"\u0021", u"\u201C",  # 0xE0
    u"\u201D", u"\u2018", u"\u2019", u"\u0028", u"\u0029", u"\u002A", u"\u002B", u"\uFFFD",  # 0xE8
    u"\u002D", u"\u002F", u"\u003B", u"\u003A", u"\u061F", u"\u003C", u"\u06D4", u"\u25CB",  # 0xF0
    u"\u25CF", u"\u066C", u"\uFFFD", u"\uFFFD", u"\u25CC", u"\u00B7", u"\uFFFD", u"\uFFFD",  # 0xF8
]

# Main conversion loop.  Reads PASCII from standard input one byte at a time,
# looks each byte up in PasciiUniMap and writes the result to standard output
# encoded as UTF-8.
#
# Exit status:
#   0  end of input reached, or an IOError occurred while reading or writing
#   2  any other unexpected exception
#   3  input ended immediately after an ATR byte

# Use the underlying binary streams when the interpreter provides them
# (sys.stdin.buffer / sys.stdout.buffer), so that input bytes are not decoded
# as text and the UTF-8 writer receives a byte stream.  Where those attributes
# do not exist, the plain sys.stdin / sys.stdout objects are used unchanged.
infile = getattr(sys.stdin, 'buffer', sys.stdin)
outfile = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
CharCnt = 0  # bytes counted so far; used only in the error report below
while 1:
    try:
        c = infile.read(1)
        if not c:
            break  # end of input
        CharCnt+=1
        co = ord(c)
        # NOTE(review): 0xE7 and 0xF0 are dropped here, so the PasciiUniMap
        # entries at those two indices are never written.  Confirm both
        # control-byte values against the PASCII standard.
        if co == 0xE7: #skip ATR and following byte
            c = infile.read(1)
            if not c:
                sys.stderr.write("Ill-formed input: ATR with no following byte.\n")
                sys.exit(3)
            continue
        if co == 0xF0: #skip EXT
            continue
        outfile.write(PasciiUniMap[co])
    except SystemExit:
        # Re-raise so that the deliberate sys.exit(3) above keeps its status
        # instead of being turned into a successful exit.
        raise
    except IOError:
        # I/O failure on either stream (for example a closed pipe): stop
        # quietly with a zero status.
        sys.exit(0)
    except Exception:
        sys.stderr.write("Error reading input after character %d.\n" % (CharCnt))
        sys.exit(2)
sys.exit(0)
