/************************************************************************/
/*                                                                      */
/*	LangIdent: long n-gram-based language identification		*/
/*	by Ralf Brown / Carnegie Mellon University			*/
/*									*/
/*  File:     dehtmlize.C  convert &#NNN; to UTF-8			*/
/*  Version:  1.25							*/
/*  LastEdit: 04oct2014 						*/
/*                                                                      */
/*  (c) Copyright 2012,2014 Ralf Brown/Carnegie Mellon University	*/
/*      This program is free software; you can redistribute it and/or   */
/*      modify it under the terms of the GNU General Public License as  */
/*      published by the Free Software Foundation, version 3.           */
/*                                                                      */
/*      This program is distributed in the hope that it will be         */
/*      useful, but WITHOUT ANY WARRANTY; without even the implied      */
/*      warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR         */
/*      PURPOSE.  See the GNU General Public License for more details.  */
/*                                                                      */
/*      You should have received a copy of the GNU General Public       */
/*      License (file COPYING) along with this program.  If not, see    */
/*      http://www.gnu.org/licenses/                                    */
/*                                                                      */
/************************************************************************/

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctype.h>
#include <memory.h>

// Size limit for the entity-name buffer in main(): main() stops collecting
//   a name once it reaches this many characters, so the value must be at
//   least the length of the longest name in the HTML_entities table below.
#define MAX_ENTITY 6 // longest name in HTML_entities (e.g. "brvbar", "frac14")

/************************************************************************/
/************************************************************************/

// States of the scanner in main(), which reads its input one character
//   at a time.
enum State
   {
      ST_Normal = 0,	// copying ordinary text through to the output
      ST_Ampersand,	// the previous character was '&'
      ST_Numeric,	// after "&#": accumulating decimal digits
      ST_HexNumeric,	// after "&#x": accumulating hexadecimal digits
      ST_Alpha		// after '&' + name start: accumulating an entity name
   } ;

//----------------------------------------------------------------------

// One row of the named-entity table: an entity name (without the
//   surrounding '&' and ';') and the code point it stands for.
struct HTML_Entity
   {
      const char *name ;	// NUL-terminated name; a null pointer ends the table
      unsigned codepoint ;	// Unicode code point written for this name
   } ;

/************************************************************************/
/************************************************************************/

// Table of recognized named character entities, mapping each name to its
//   Unicode code point.  The list is terminated by an entry whose name is
//   a null pointer.  The table is read-only, hence const.
// Note: the "//..." markers flag ranges of standard entities (most Greek
//   letters, arrows and math symbols) that are not listed here.
static const HTML_Entity HTML_entities[] =
   {
      { "quot",		0x0022 },
      { "amp",		0x0026 },
      { "apos",		0x0027 },
      { "lt",		0x003C },
      { "gt",		0x003E },
      { "nbsp",		0x00A0 },
      { "iexcl",	0x00A1 },
      { "cent",		0x00A2 },
      { "pound",	0x00A3 },
      { "curren",	0x00A4 },
      { "yen",		0x00A5 },
      { "brvbar",	0x00A6 },
      { "sect",		0x00A7 },
      { "uml",		0x00A8 },
      { "copy",		0x00A9 },
      { "ordf",		0x00AA },
      { "laquo",	0x00AB },
      { "not",		0x00AC },
      { "shy",		0x00AD },
      { "reg",		0x00AE },
      { "macr",		0x00AF },
      { "deg",		0x00B0 },	// the entity is &deg;, not &degr;
      { "plusmn",	0x00B1 },
      { "sup2",		0x00B2 },
      { "sup3",		0x00B3 },
      { "acute",	0x00B4 },
      { "micro",	0x00B5 },
      { "para",		0x00B6 },
      { "middot",	0x00B7 },
      { "cedil",	0x00B8 },
      { "sup1",		0x00B9 },
      { "ordm",		0x00BA },
      { "raquo",	0x00BB },
      { "frac14",	0x00BC },
      { "frac12",	0x00BD },
      { "frac34",	0x00BE },
      { "iquest",	0x00BF },
      { "Agrave",	0x00C0 },
      { "Aacute",	0x00C1 },
      { "Acirc",	0x00C2 },
      { "Atilde",	0x00C3 },
      { "Auml",		0x00C4 },
      { "Aring",	0x00C5 },
      { "AElig",	0x00C6 },
      { "Ccedil",	0x00C7 },
      { "Egrave",	0x00C8 },
      { "Eacute",	0x00C9 },
      { "Ecirc",	0x00CA },
      { "Euml",		0x00CB },
      { "Igrave",	0x00CC },
      { "Iacute",	0x00CD },
      { "Icirc",	0x00CE },
      { "Iuml",		0x00CF },
      { "ETH",		0x00D0 },
      { "Ntilde",	0x00D1 },
      { "Ograve",	0x00D2 },
      { "Oacute",	0x00D3 },
      { "Ocirc",	0x00D4 },
      { "Otilde",	0x00D5 },
      { "Ouml",		0x00D6 },
      { "times",	0x00D7 },
      { "Oslash",	0x00D8 },
      { "Ugrave",	0x00D9 },
      { "Uacute",	0x00DA },
      { "Ucirc",	0x00DB },
      { "Uuml",		0x00DC },
      { "Yacute",	0x00DD },
      { "THORN",	0x00DE },
      { "szlig",	0x00DF },

      { "agrave",	0x00E0 },
      { "aacute",	0x00E1 },
      { "acirc",	0x00E2 },
      { "atilde",	0x00E3 },
      { "auml",		0x00E4 },
      { "aring",	0x00E5 },
      { "aelig",	0x00E6 },
      { "ccedil",	0x00E7 },
      { "egrave",	0x00E8 },
      { "eacute",	0x00E9 },
      { "ecirc",	0x00EA },
      { "euml",		0x00EB },
      { "igrave",	0x00EC },
      { "iacute",	0x00ED },
      { "icirc",	0x00EE },
      { "iuml",		0x00EF },
      { "eth",		0x00F0 },
      { "ntilde",	0x00F1 },
      { "ograve",	0x00F2 },
      { "oacute",	0x00F3 },
      { "ocirc",	0x00F4 },
      { "otilde",	0x00F5 },
      { "ouml",		0x00F6 },
      { "divide",	0x00F7 },
      { "oslash",	0x00F8 },
      { "ugrave",	0x00F9 },
      { "uacute",	0x00FA },
      { "ucirc",	0x00FB },
      { "uuml",		0x00FC },
      { "yacute",	0x00FD },
      { "thorn",	0x00FE },
      { "yuml",		0x00FF },
      { "OElig",	0x0152 },
      { "oelig",	0x0153 },
      { "Scaron",	0x0160 },
      { "scaron",	0x0161 },
      { "Yuml",		0x0178 },
      { "fnof",		0x0192 },
      { "circ",		0x02C6 },
      { "tilde",	0x02DC },
      { "Alpha",	0x0391 },
      { "Beta",		0x0392 },
      //...
      { "Omega",	0x03A9 },
      { "alpha",	0x03B1 },
      //...
      { "omega",	0x03C9 },
      { "ensp",		0x2002 },
      { "emsp",		0x2003 },
      { "thinsp",	0x2009 },
      { "ndash",	0x2013 },
      { "mdash",	0x2014 },
      { "lsquo",	0x2018 },
      { "rsquo",	0x2019 },
      { "sbquo",	0x201A },
      { "ldquo",	0x201C },
      { "rdquo",	0x201D },
      { "bdquo",	0x201E },
      { "dagger",	0x2020 },
      { "Dagger",	0x2021 },
      { "bull",		0x2022 },
      { "hellip",	0x2026 },
      { "permil",	0x2030 },
      { "prime",	0x2032 },
      { "Prime",	0x2033 },
      { "lsaquo",	0x2039 },
      { "rsaquo",	0x203A },
      { "oline",	0x203E },
      { "frasl",	0x2044 },
      { "euro",		0x20AC },
      //...
      { "lang",		0x2329 },
      { "rang",		0x232A },
      { "loz",		0x25CA },
      { "spades",	0x2660 },
      { "clubs",	0x2663 },
      { "hearts",	0x2665 },
      { "diams",	0x2666 },	// hexadecimal; a leading 0 would make it octal
      { 0, 0 }	// sentinel for end of list
   } ;

/************************************************************************/
/************************************************************************/

// Write 'codepoint' to 'outfp' encoded as UTF-8 (one to four bytes).
//   Values that have no valid UTF-8 encoding -- the surrogate range
//   U+D800..U+DFFF and anything above U+10FFFF -- are replaced by a
//   single '?' written to the same stream.
static void write_utf8(unsigned long codepoint, FILE *outfp)
{
   if (codepoint < 0x80)
      {
      // 0xxxxxxx
      fputc((int)codepoint,outfp) ;
      }
   else if (codepoint < 0x800)
      {
      // 110xxxxx 10xxxxxx
      fputc((int)(0xC0 | ((codepoint >> 6) & 0x1F)), outfp) ;
      fputc((int)(0x80 | (codepoint & 0x3F)), outfp) ;
      }
   else if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
      {
      // surrogates are reserved for UTF-16 and must not appear in UTF-8
      fputc('?',outfp) ;
      }
   else if (codepoint < 0x10000)
      {
      // 1110xxxx 10xxxxxx 10xxxxxx
      fputc((int)(0xE0 | ((codepoint >> 12) & 0x0F)), outfp) ;
      fputc((int)(0x80 | ((codepoint >> 6) & 0x3F)), outfp) ;
      fputc((int)(0x80 | (codepoint & 0x3F)), outfp) ;
      }
   else if (codepoint <= 0x10FFFF)	// U+10FFFF itself is a valid code point
      {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      fputc((int)(0xF0 | ((codepoint >> 18) & 0x07)), outfp) ;
      fputc((int)(0x80 | ((codepoint >> 12) & 0x3F)), outfp) ;
      fputc((int)(0x80 | ((codepoint >> 6) & 0x3F)), outfp) ;
      fputc((int)(0x80 | (codepoint & 0x3F)), outfp) ;
      }
   else
      {
      // beyond the Unicode range: substitute a placeholder
      fputc('?',outfp) ;
      }
   return ;
}

//----------------------------------------------------------------------

static bool start_of(const HTML_Entity *entities, int ch)
{
   if (!entities)
      return false ;
   while (entities->name)
      {
      if (entities->name[0] == ch)
	 return true ;
      entities++ ;
      }
   return false ;
}

//----------------------------------------------------------------------

// Look up the entity name given by the first 'name_len' characters of
//   'name' ('name' need not be NUL-terminated) in the table 'entities',
//   which ends at the first entry with a null name.  On a match, write the
//   entity's code point to stdout as UTF-8 and return true; otherwise
//   write nothing and return false.
// The whole name must match: comparing only the first 'name_len' bytes
//   would accept any prefix of a table entry (e.g. "a" for "amp") and could
//   read past the end of a table name shorter than 'name_len'.
static bool write_entity_char(const HTML_Entity *entities, const char *name, unsigned name_len)
{
   if (!entities || !name || name_len == 0)
      return false ;
   for ( ; entities->name ; entities++)
      {
      // check the length first so that memcmp never reads beyond the
      //   table entry's own characters
      if (strlen(entities->name) == name_len &&
	  memcmp(entities->name,name,name_len) == 0)
	 {
	 write_utf8(entities->codepoint,stdout) ;
	 return true ;
	 }
      }
   return false ;
}

//----------------------------------------------------------------------

// Write the first 'len' characters of 'str' to 'fp', one at a time; 'str'
//   need not be NUL-terminated.  A null 'str' writes nothing.
static void write_string(const char *str, unsigned len, FILE *fp)
{
   if (str)
      {
      const char *end = str + len ;
      while (str != end)
	 {
	 fputc(*str,fp) ;
	 ++str ;
	 }
      }
   return ;
}

//----------------------------------------------------------------------

// Print a two-line usage summary to stderr; 'argv0' is the program name
//   shown in the first line.
static void usage(const char *argv0)
{
   FILE *err = stderr ;
   fprintf(err,"Usage: %s <in >out\n",argv0) ;
   fputs("  convert &#nnn; entities to actual UTF-8 characters\n",err) ;
}

//----------------------------------------------------------------------

int main(int argc, char **argv)
{
   if (argc > 1)
      {
      usage(argv[0]) ;
      return 1;
      }
   State state = ST_Normal ;
   unsigned long codepoint = 0 ;
   char entity_name[MAX_ENTITY+1] ;
   unsigned entity_name_len = 0 ;
   while (!feof(stdin))
      {
      int ch = fgetc(stdin) ;
      if (ch == EOF)
	 break ;
      switch (state)
	 {
	 case ST_Normal:
	    if (ch == '&')
	       state = ST_Ampersand ;
	    else
	       fputc(ch,stdout) ;
	    break ;
	 case ST_Ampersand:
	    if (ch == '#')
	       {
	       state = ST_Numeric ;
	       codepoint = 0 ;
	       }
	    else if (start_of(HTML_entities,ch))
	       {
	       state = ST_Alpha ;
	       entity_name[0] = (char)ch ;
	       entity_name_len = 1 ;
	       }
	    else
	       {
	       fputc('&',stdout) ;
	       fputc(ch,stdout) ;
	       state = ST_Normal ;
	       }
	    break ;
	 case ST_Numeric:
	    if (ch == 'x' || ch == 'X')
	       {
	       if (codepoint == 0)
		  state = ST_HexNumeric ;
	       else
		  {
		  write_utf8(codepoint,stdout) ;
		  fputc(ch,stdout) ;
		  state = ST_Normal ;
		  }
	       }
	    if (ch >= '0' && ch <= '9')
	       codepoint = 10 * codepoint + (ch - '0') ;
	    else
	       {
	       write_utf8(codepoint,stdout) ;
	       if (ch != ';')
		  fputc(ch,stdout) ;
	       state = ST_Normal ;
	       }
	    break ;
	 case ST_HexNumeric:
	    if (ch >= '0' && ch <= '9')
	       codepoint = (16 * codepoint) + (ch - '0') ;
	    else if (tolower(ch) >= 'a' && tolower(ch) <= 'f')
	       codepoint = (16 * codepoint) + (tolower(ch) - 'a' + 10) ;
	    else
	       {
	       write_utf8(codepoint,stdout) ;
	       if (ch != ';')
		  fputc(ch,stdout) ;
	       state = ST_Normal ;
	       }
	    break ;
	 case ST_Alpha:
	    if (ch == ';' || entity_name_len >= MAX_ENTITY)
	       {
	       if (!write_entity_char(HTML_entities,entity_name,entity_name_len))
		  {
		  write_string(entity_name,entity_name_len,stdout) ;
		  }
	       if (ch != ';')
		  fputc(ch,stdout) ;
	       state = ST_Normal ;
	       }
	    else
	       {
	       entity_name[entity_name_len++] = (char)ch ;
	       }
	    break ;
	 default:
	    return 1 ;
	 }
      }
   return 0 ;
}
