/*---------------------------------------------------------- FILE meaning.C OVERVIEW Lexifier program code LOG
	$Log: meaning.C,v $
	Revision 1.82  2005/10/19 15:00:17  svoboda
	Removed memtrays
	
	Revision 1.81  2005/06/28 19:47:37  svoboda
	Added token offsets to FS from lexifier
	
	Revision 1.80  2005/02/24 18:59:19  svoboda
	Several Windows portability changes s/string.H/kstring.H/g s/out/tout/
	
	Revision 1.79  2003/04/30 19:26:49  svoboda
	Now produces complete token sequences for S with rogue literal tag
	
	Revision 1.78  2003/01/13 18:04:07  svoboda
	Incorporated md5i's STD C++ compatability; now compilues under g++ 3.0
	
	Revision 1.71.2.3  2002/11/14 21:27:55  md5i
	Merge with trunk.
	
	Revision 1.77  2002/10/15 20:29:27  svoboda
	Adjustment to fix 507...kce-minus POSes not added to alternate-cats slot
	
	Revision 1.76  2002/10/14 17:20:39  svoboda
	Alternate-cats is now assigned to kce-minus terms
	
	Revision 1.75  2002/10/07 21:06:18  svoboda
	Fixed bug 488
	
	Revision 1.74  2002/09/27 16:47:11  svoboda
	Changed CTE to KCE for DMK & FS
	
	Revision 1.73  2002/04/16 13:40:16  svoboda
	Fixed 'hardened steel ball' regression - bug 393
	
	Revision 1.72  2002/04/10 16:21:25  svoboda
	Added diagnostics messages to Analyzer
	
	Revision 1.71.2.2  2002/06/18 20:28:33  md5i
	Merge in HEAD.
	
	Revision 1.71.2.1  2002/03/19 18:08:43  md5i
	Updates for STD C++.
	
	Revision 1.71  2002/02/11 23:11:14  svoboda
	Patrick_Statement_Lists must NOT be initialized at startup. AIX hates that
	
	Revision 1.70  2002/02/11 20:17:40  svoboda
	Oops, true is now a valid keyword...except on AIX
	
	Revision 1.69  2001/11/29 21:29:07  svoboda
	Added head-root slot, as per Bug 290
	
	Revision 1.68  2001/10/22 20:39:45  svoboda
	OK, overhauled unify_defs's initialization routines to be consistent
	
	Revision 1.67  2001/10/19 21:38:22  svoboda
	Bug 279 fix...don't unify idioms with words if no idioms
	
	Revision 1.66  2001/10/16 18:09:15  svoboda
	Bug 269 - unify_defs unifies class iff Idiom class not defined
	
	Revision 1.65  2001/10/15 19:04:38  svoboda
	unify-defs once again unifies semantic verb features (Bug 262)
	
	Revision 1.64  2001/07/05 16:14:33  svoboda
	Idiom-word inheritance no longer inherits scomp, xcomp, or apcomp (fix for Bug 156)
	
	Revision 1.63  2000/11/03 20:35:01  svoboda
	Adopted simpler output format for ambiguities under KCE. CTE outputs unchanged
	
	Revision 1.62  2000/10/26 18:30:59  svoboda
	Changed CTE to KCE for DMK slot and for disambig tags
	
	Revision 1.61  2000/09/19 13:36:54  svoboda
	Bugfix...final period may still also be part of an abbrev

	Revision 1.60  2000/09/05 14:12:22  svoboda
	Character entities may no longer be considered meaningless tokens

	Revision 1.59  2000/08/29 16:17:54  svoboda
	Bugfix, single punct chars may no longer be Meaningless_Tokens

	Revision 1.58  2000/04/14 12:57:25  svoboda
	Definition-unification now ignores abbreviations & acronyms

	Revision 1.57  2000/03/14 12:09:37  svoboda
	Analyzer now reports meaningless tokens

	Revision 1.56  1999/12/03 13:58:33  svoboda
	KANTOO now produces log output formatted as outlines

	Revision 1.55  1999/10/07 13:33:48  svoboda
	SMCS codes no longer included when unifying words & idioms

	Revision 1.54  1999/09/29 17:14:53  svoboda
	Bugfix...pass strings to morphonizer dequoted

	Revision 1.53  1999/09/28 14:30:54  svoboda
	Quick bugfix to hash wildcard feature

	Revision 1.52  1999/09/28 13:39:03  svoboda
	PR 5991 - added wildcard 'hash' symbols

	Revision 1.51  1999/07/27 14:03:39  svoboda
	Unify-definitions no longer prunes out idioms if their head has no matching POS

	Revision 1.50  1999/07/19 12:09:38  svoboda
	Oops, literals were still getting added

	Revision 1.49  1999/07/16 12:30:30  svoboda
	Eliminated numberizer/tagifier & fixed some memory leaks

	Revision 1.48  1999/07/08 18:59:53  svoboda
	Removed Numberizer and Tagifier...made obsolete by more powerful morphology

	Revision 1.47  1999/07/03 12:02:25  svoboda
	Moved is_int, etc. from Symbol_Frame to KString...where they belong

	Revision 1.46  1999/07/02 18:59:23  svoboda
	Symbol_Frames now replace Atomic_Frames, Boolean_Frames, and Integer_Frames

	Revision 1.45  1999/06/17 14:29:52  svoboda
	Symbols are now implemented with Frames, and like Frames

	Revision 1.44  1999/05/11 12:53:27  md5i
	Changes to allow symbols to be created staticly despite memtrays.

	Revision 1.43  1999/03/22 13:04:21  md5i
	Fencepost error fixed.

	Revision 1.42  1999/03/19 17:12:59  md5i
	Use all-heads to fix head-pos values in ?CTE attach tags.

	Revision 1.41  1999/03/16 15:21:34  md5i
	Memory leak fix.

	Revision 1.40  1999/03/15 16:11:25  md5i
	Memory fixes.

	Revision 1.39  1999/03/01 16:12:29  svoboda
	Added SMCS Codes

	Revision 1.38  1999/01/29 10:45:49  svoboda
	Made zero-numbering optional for units

	Revision 1.37  1998/12/16 18:03:03  svoboda
	Made fuzzy head-pos go both ways

	Revision 1.36  1998/12/10 18:01:44  svoboda
	Added disgusting fuzzy_head_pos hack in to Lexifier

	Revision 1.35  1998/11/23 09:39:23  md5i
	Merged Memory_Trays into main distribution.

	Revision 1.34  1998/11/12 17:29:11  svoboda
	Bugfixes: missing-bug, empty-literal-tags bug

	Revision 1.33  1998/11/06 17:41:43  svoboda
	DMK_Translator now points out invalid symbols in DMK

	Revision 1.32  1998/11/02 15:46:21  svoboda
	Syntaxifier now takes a graph of inputs, not a sequence

	Revision 1.31.2.1  1998/11/13 17:45:18  md5i
	Added memtrays.

	Revision 1.31  1998/10/27 17:42:47  svoboda
	OK got unify_defs working right now.

	Revision 1.30  1998/10/26 18:59:00  svoboda
	Slight bug in unify-defs; allow non-noun idioms to unify correctly

	Revision 1.29  1998/10/13 19:32:02  svoboda
	Some bugfixes wrt idiom/word unification

	Revision 1.28  1998/10/08 18:26:09  svoboda
	Oops...numbers were getting wrongly identified in phrases

	Revision 1.27  1998/10/08 14:23:28  svoboda
	Removed (cte -) from words when unifying with idioms

	Revision 1.26  1998/10/07 16:59:28  svoboda
	Fixed bug wrt idiom meanings/orthos/roots being truncated to head

	Revision 1.25  1998/10/06 16:39:23  svoboda
	Idioms now are constructed from symbols, not strings

	Revision 1.24  1998/09/30 13:57:25  svoboda
	Bugfixes in definition unification

 * Revision 1.23  98/09/23  15:15:28  svoboda
 * Fixed literals bug
 * 
	Revision 1.22  1998/09/21 15:08:36  svoboda
	Added numeric-value to lexifier for idioms

 * Revision 1.21  98/09/11  12:36:52  md5i
 * Symbols are now based on Memvector_KStrings.
 * 
	Revision 1.20  1998/09/04 14:38:32  svoboda
	Converted Lexifier to speak PATRICK

	Revision 1.19  1998/08/03 16:37:12  svoboda
	Fixed alternate-cats to ignore phrases and non-CTE words

	Revision 1.18  1998/07/27 15:24:14  svoboda
	Adjusted ED/ING to be first-class alternate-cats

	Revision 1.17  1998/07/22 16:06:34  svoboda
	Added alternate-cats ing and ed

	Revision 1.16  1998/06/24 15:44:22  svoboda
	Defs unify if typeids match, not cats

	Revision 1.15  1998/06/23 17:16:14  svoboda
	Added alternate-cats slot

	Revision 1.14  1998/06/22 18:26:34  svoboda
	The unify-defs procedure handles several more features now

	Revision 1.13  1998/06/19 14:03:59  svoboda
	Many bugfixes, incl. idiom inheritance, separate Defs, / & ' handling

	Revision 1.12  1998/06/02 11:59:17  svoboda
	Added install.data for language-related files to install.

	Revision 1.11  1998/05/27 13:33:10  svoboda
	Plugged memory leaks

	Revision 1.10  1998/02/26 13:20:01  md5i
	Shared Library name no longer hard codes KANTOO environ.

	Revision 1.9  1998/01/13 10:52:48  md5i
	Numbers are now Noun_Definitions, as opposed to Real Definitions.

	Revision 1.8  1997/11/26 12:51:00  svoboda
	Made partial DMK image optimization

 * Revision 1.7  97/09/09  16:16:42  md5i
 * More hacking to get things working under g++ with optimization.
 * 
	Revision 1.6  1997/08/22 16:08:44  svoboda
	Fixed bug of interpreting '.' as a number

	Revision 1.5  1997/08/01 15:59:38  svoboda
	Created Shared_Library class, extended Rule_Database to use it, improved logs

 * Revision 1.4  97/06/05  19:18:06  svoboda
 * Redesigned unification, adding paths/slots/typeless grammar
 * 
 * Revision 1.3  97/06/01  16:34:03  svoboda
 * Added symbol table class
 * 
 * Revision 1.2  97/05/20  16:28:19  svoboda
 * Added protocol class, made modules less interdependent
 * 
 * Revision 1.1  97/05/13  13:22:04  svoboda
 * Added Lexifier
 *
	
AUTHOR David Svoboda Copyright (C) 1997 Carnegie Mellon All Rights Reserved ----------------------------------------------------------*/ #include "dmk_database.H" #include "deftypes.H" #include "lexifier.H" #include "morphonizer.H" #include "morphsemizer.H" #include "wordifier.H" #include "idiomifier.H" #include #include "capitalization.H" #include "idiom_dictary.H" #include "sequence_frame.H" #include "slot_frame.H" #include "patrick_statements.H" #include "smcs.H" const Symbol Lexifier::Root("root"); const Symbol Lexifier::Ortho("ortho"); const Symbol Lexifier::Head_Token("head-token"); const Symbol Head_Root("head-root"); const Symbol Lexifier::Attributes("attributes"); const Symbol Lexifier::Being("being"); const Symbol Lexifier::Alt_Cats("alternate-cats"); const Symbol Lexifier::Hash_Value("hash"); const Frame Lexifier::Number_Category("number"); const Frame Lexifier::Verb("v"); const Frame Lexifier::ED("ed"); const Frame Lexifier::ING("ing"); const Patrick_Path Attributes_Head_Pos("%(attributes head-pos)"); const Patrick_Path Attributes_All_Heads("%(attributes all-heads)"); const Patrick_Path Attributes_Sel("%(attributes sel)"); const Patrick_Path Snobbish("%(concepts class)"); const KString Lexifier::Special("SP"); const Concept_ID Numeric_Concept_ID("C", "DECIMAL-NUMBER"); const Patrick_Statement Is_Literal("(%(concepts contents) =c literal)"); const Symbol Meaningless("meaningless"); const Symbol Token_Start("token-start"); const Symbol Token_End("token-end"); ////////// // Computes the head of the idiom and the tokens involved. Fills idiom_head // with head value and returns subvector of tokens involved. Subvector< Token> tokens(const Definition& d, Iterator& idiom_head, const Vector< Token>& input, Iterator token_head) { const Symbol_Frame* words = Symbol_Frame::down_cast( d.slot_value( Idiom_Dictionary::Words).value()); Iterator length = words ? words->value().int_value() : 1; const Symbol_Frame* head = Symbol_Frame::down_cast( d.slot_value( Idiom_Dictionary::Head).value()); if (head) idiom_head = head->value().int_value(); else idiom_head = length - 1; Iterator start = token_head - idiom_head; return Subvector< Token>( input, start, length); } ////////// // This is all the possible forms of the verb 'to be'. Vector< KString> to_be_forms() { Vector< KString> result; result += KString("be"); result += KString("am"); result += KString("are"); result += KString("is"); result += KString("was"); result += KString("were"); result += KString("been"); result += KString("being"); result += KString("beings"); return result; } Vector< KString> To_Be_Forms = to_be_forms(); ////////// // Creates a literal definition from a token void add( Definition& d, const Subvector< Token>& tokens, const KString& head_root = KString::Default, Iterator head = 0, const Frame& hash_value = Undefined_Frame::Value) { ASSERT( tokens.length() && (tokens.length() >= head)); Iterator ti; // Add %tokens Frame token_frame; for (Iterator t = tokens.start(); t < tokens.end(); t++) token_frame.f_and( Frame( Symbol( KString_Format( t+1)))); d.unify( Definition::Tokens, token_frame); d.unify( Token_Start, Frame( Symbol( KString_Format( tokens.original_vector()[ tokens.start()].substring().start())))); d.unify( Token_End, Frame( Symbol( KString_Format( tokens.original_vector()[ tokens.end() - 1].substring().end())))); // Add %head-token if (tokens.length() > 1) d.unify( Lexifier::Head_Token, Frame( Symbol( KString_Format( head + tokens.start() + 1)))); // Add %head-root KString hr = (head_root != KString::Default) ? head_root : tokens.original_vector()[ head + tokens.start()].substring().string(); hr.capitalization( Capitalization::Lowercase); d.unify( Head_Root, Frame( Symbol( hr))); // Add %ortho const Vector< Token> original_tokens = tokens.original_vector(); Iterator start = original_tokens[ tokens.start()].substring().start(); Iterator end = original_tokens[ tokens.end() - 1].substring().end(); if (d.slot_value( Lexifier::Ortho)->undefined()) d.unify( Lexifier::Ortho, Frame( Symbol( Substring( original_tokens[0].substring().original_string(), start, end - start).string()))); // Add %root unless already defined KString root; const Symbol_Frame* rootf = Symbol_Frame::down_cast( d.slot_value( Lexifier::Root).value()); if (rootf) root = rootf->value(); else { for (ti = 0; ti < tokens.length(); ti++) { if (ti) root += ' '; if (ti == head && head_root.size()) root += head_root; else root += tokens.original_vector()[ tokens.start() + ti].substring().string(); } root.capitalization( Capitalization::Lowercase); d.unify( Lexifier::Root, Frame( Symbol( root))); } // Add %meaning unless already defined or literal if (d.slot_value( Definition::Meanings)->undefined() && head_root.size()) { Frame posf = d.slot_value( Definition::Category); const Symbol_Frame* possf = Symbol_Frame::down_cast( posf.value()); d.meaning( Meaning( possf ? possf->value() : Lexifier::Special, root, tokens.start() + head + 1)); } // Add (being +) if necessary if (To_Be_Forms.find( root) != Invalid_Iterator) d.unify( Lexifier::Being, Symbol::Plus); // Add %hash-value if (!(hash_value->undefined()) && d.slot_value( Lexifier::Hash_Value)->undefined()) d.unify( Lexifier::Hash_Value, hash_value); } ///////// // Unify idiom definition with word def of same category, if avilable Vector< Definition> unify_definitions(const Set< Definition>& idioms, Definition& words) { if (idioms.size() == 0) return Vector< Definition>(); // First clean up word definition bool snobbish = False; for (Iterator i = 0; i < idioms.size(); i++) { Frame tmp = idioms[i]; if (!(Snobbish.evaluate( tmp)->undefined())) { snobbish = True; break; } } static Patrick_Statement_List Prepare_Defs( "((%abbreviation = *REMOVE*)" " (%acronym = *REMOVE*)" " (%(concepts id) = *REMOVE*)" " (%(concepts smcs) = *REMOVE*)" " (%(concepts smcs-class) = *REMOVE*)" " (*OR* ((%pos = (*OR* unit n prop pron))" " (%pos <= (*OR* unit n prop pron)))" " ((%pos = (*NOT* (*OR* unit n prop pron))))))"); Prepare_Defs.evaluate( words); static Patrick_Statement_List Remove_Class( "((%(concepts class) = *REMOVE*)" " (%(concepts apcomp) = *REMOVE*)" " (%(concepts scomp) = *REMOVE*)" " (%(concepts xcomp) = *REMOVE*)" " (%(concepts valency) = *REMOVE*))"); if (snobbish) Remove_Class.evaluate( words); Patrick_Statement("(%kce = *REMOVE*)").evaluate( words); Vector< Definition> results; for (Iterator i3 = 0; i3 < idioms.size(); i3++) { Set_Frame* id = (Set_Frame*) Set_Frame::down_cast( idioms[i3].value()); Frame new_def; if (!id) new_def = idioms[i3]->unify( words); else for (Iterator i2 = 0; i2 < id->values().size(); i2++) { Frame pos_def = id->values()[i2]->unify( words); if (!(pos_def->fail())) new_def.f_or( pos_def); else new_def.f_or( id->values()[i2]); } if (!(new_def->fail())) results += Definition::down_cast( new_def); else results += idioms[i3]; } return results; } ////////// // Add alternate-cats fields void add_alternate_cats( Frame& defs) { Frame kce_defs( defs); if (Patrick_Statement("(%kce = +)").evaluate( kce_defs)->fail()) return; const Set_Frame* sdefs = Set_Frame::down_cast( defs.value()); if (!sdefs) return; Frame all_cats // must use path, should be *OR* = Patrick_Path( Vector< Symbol>( 1, Definition::Category)).evaluate( kce_defs); Frame temp = defs; Patrick_Statement_List Verb_ED( "((%past =c +)" " (%pred-passive = +)" " (%(concepts class subject) =c agent))"); bool ed = !(Verb_ED.evaluate( temp)->fail()); temp = defs; Patrick_Statement_List Verb_ING( "((%past = -)" " (%participle =c +)" " (%(concepts valency) =c intransitive))"); bool ing = !(Verb_ING.evaluate( temp)->fail()); if (ed || ing) all_cats.unify( (Lexifier::Verb)->f_not()); // Now assign the categories to each definition in the *OR* Frame new_defs; for (Iterator i = 0; i < sdefs->values().size(); i++) { Frame alt_cats; Frame new_def = sdefs->values()[i], defs( new_def); Frame this_cat = new_def.slot_value( Definition::Category); Frame new_alt_cats( all_cats); if (new_alt_cats.unify( this_cat.f_not())->fail()) new_alt_cats = Undefined_Frame::Value; temp = new_def; if (ed && Verb_ED.evaluate( temp)->fail()) new_alt_cats.f_or( Lexifier::ED); temp = new_def; if (ing && Verb_ING.evaluate( temp)->fail()) new_alt_cats.f_or(Lexifier::ING); if (!new_alt_cats->undefined()) new_def.unify( Lexifier::Alt_Cats, new_alt_cats); new_defs.f_or( new_def); } defs = Definition::down_cast( new_defs); } // This algorithm is specifically tailored to work with CAT's LE...because it // sometimes gets head_pos wrong (usually because the head-pos value is obsolete) // This algorithm should not apply to anything except CTE. // (It will not work unless a tag has the all-heads attribute set, which is only // done by the LE. Besides, we haven't yet needed structural tags in KCE anyway.) // Therefore, this algorithm is purely meant for CTE, not KCE void adjust_head_pos(Definition& tag, const Vector& tokens) { Frame all_heads = Attributes_All_Heads.evaluate(tag); if (all_heads->undefined()) return; const Symbol_Frame *heads = Symbol_Frame::down_cast(all_heads.value()); if (!heads) THROW(Data_Error, << "Invalid all-heads in tag:\n" << tag); Frame selection = Attributes_Sel.evaluate(tag); if (selection->undefined()) THROW(Data_Error, << "Missing sel in tag:\n" << tag); const Symbol_Frame *sel = Symbol_Frame::down_cast(selection.value()); int selval; if (!sel) THROW(Data_Error, << "Invalid sel in tag:\n" << tag); selval = sel->value().int_value(); KString heads_string = heads->value(); heads_string.dequote(); Vector heads_strings = heads_string.split(); if (heads_strings.size() % 2) THROW(Data_Error, << "Attribute all-heads wrong size:\n" << tag); Iterator n = heads_strings.size() >> 1; if (selval > n || selval < 1) THROW(Data_Error, << "Attribute sel is invalid:\n" << tag); int offset = heads_strings[(selval - 1) * 2].int_value() - 1; int current_offset = tokens[0].substring().start(); Iterator j; for (j = 0; current_offset < offset && j < tokens.size(); j++) { if (tokens[j].type() == Token_Type::Tag) if (tokens[j].substring().length() > 5) { KString tmp(tokens[j].substring().string(), 0, 5); tmp.capitalization(Capitalization::Uppercase); if (tmp != KString("& hashes) { const KString& token = input.value(); // if input is single char, use it. if ((token.size() == 1) && isalpha( token[0])) { output = Hash; hashes += token; return True; } // if input contains numbers, include them all bool number_flag = False; KString new_hash; for (Iterator i = 0; i < token.size(); i++) { if (isdigit( token[i])) { if (!number_flag) { number_flag = True; hashes += KString( 1, token[i]); new_hash += Hash.value()[1]; } else { hashes[ hashes.size() - 1] += token[i]; }} else { number_flag = False; new_hash += token[i]; }} output = Symbol( new_hash); return (input != output); } static Vector< KString> Default_Vector_KString; ////////// // The execute function. Will run the main algorithm described // above. Vector< Frame> Lexifier::execute(const Vector< Token>& input) const { Vector< Frame> results; Iterator i; // This vector will indicate what tokens need literal meanings on them. Vector< bool> literal_flags( input.size(), True); Frame literal_tag; // becomes a tagname when a literal tag appears Iterator literal_token_start; // The various strings of every input token. Vector< Symbol> input_symbols; for (i = 0; i < input.size(); i++) input_symbols += Symbol( input[i].substring().string()); // Construct hash info Vector< Symbol> hashed_symbols( input.size(), Symbol::Default); Vector< Vector< KString> > hashes( input.size(), Default_Vector_KString); bool hashable = False; for (i = 0; i < input.size(); i++) hashable = try_hashing( input_symbols[i], hashed_symbols[i], hashes[i]) || hashable; // Now analyze tokens for (i = 0; i < input.size(); i++) { Vector< Frame> idiom_results; Frame word_results; // Create trivial morpheme, or nontrivial one if token is word. KString is = input_symbols[i].value(); is.dequote(); Set< Morpheme> sm = morphonizer()->run( is); for (Iterator m = 0; m < sm.size(); m++) { Vector< Definition> idiom_defs; Definition word_defs, unifiable_word_defs; const KString& root = sm[m].root(); word_defs = wordifier()->run( root); // Call Morphsemizer on words if (!(word_defs->undefined() || word_defs->fail())) { unifiable_word_defs = word_defs; Description desc( word_defs, sm[m]); Definition final_defs = morphsemizer()->run( desc); Iterator head; Subvector< Token> words; if (!(final_defs->undefined() || final_defs->fail()) ) { Set_Frame* sf = Set_Frame::down_cast((Proto_Frame&) (final_defs.mutable_value())); const Definition& wr = sf ? Definition::down_cast( sf->values()[0]) : final_defs; words = tokens( wr, head, input, i); Subvector< Token> st( input, i, 1); if (sf) for (Iterator sfi = 0; sfi < sf->values().size(); sfi++) add( Definition::down_cast( sf->mutable_values()[sfi]), st, root); else add( final_defs, st, root); literal_flags[i] = False; word_results.f_or( final_defs); }} // Now search for idioms. Symbol old_idiom = input_symbols[i]; input_symbols[i] = Symbol( root); Idiom idiom( input_symbols, i); idiom_defs += unify_definitions( idiomifier()->run( idiom), unifiable_word_defs); input_symbols[i] = old_idiom; // If sentence contains hashable elements, try using hashed_symbols if (hashable) { old_idiom = hashed_symbols[i]; hashed_symbols[i] = Symbol( root); idiom = Idiom( hashed_symbols, i); idiom_defs += unify_definitions( idiomifier()->run( idiom), unifiable_word_defs); hashed_symbols[i] = old_idiom; } // Call Morphsemizer to validate/altar idiom definitions for (Iterator sdi = 0; sdi < idiom_defs.size(); sdi++) { Description desc = Description( idiom_defs[sdi], sm[m]); Definition altered_def = morphsemizer()->run( desc); if (altered_def->undefined() || altered_def->fail()) continue; // Figure out just how many tokens were used in the Definition. Set_Frame* sf = (Set_Frame*) Set_Frame::down_cast( altered_def.value()); const Definition& ad = sf ? Definition::down_cast( sf->values()[0]) : altered_def; Iterator head; Subvector< Token> words = tokens( ad, head, input, i); // Figure out hash frames Frame hash_frame; for (Iterator wi = words.start(); wi < words.end(); wi++) for (Iterator whi = 0; whi < hashes[wi].size(); whi++) hash_frame.f_and( Frame( Symbol( hashes[wi][whi]))); if (sf) { for (Iterator sfi = 0; sfi < sf->values().size(); sfi++) add( Definition::down_cast( sf->mutable_values()[sfi]), words, root, head, hash_frame); } else add( altered_def, words, root, head, hash_frame); smcs_checker()->validate( altered_def); idiom_results += altered_def; for (Iterator li = words.start(); li < words.end(); li++) literal_flags[li] = False; }} // Several special things to do with tags Frame wr( word_results); if (!(Is_Literal.evaluate( wr)->fail())) { const Frame& root = word_results.slot_value( Lexifier::Root); if (literal_tag->undefined()) { // start of literal sequence results += word_results; literal_tag = root; literal_token_start = i+1; } else if (literal_tag == root) { results += word_results; // don't add literal tokens if there are none if (literal_token_start < i) { Definition lit_def; Iterator start = input[ literal_token_start-1].substring().end(); lit_def.unify( Lexifier::Ortho, Frame( Symbol( Substring( input[0].substring().original_string(), start, input[i].substring().start() - start).string()))); add( lit_def, Subvector< Token>( input, literal_token_start, i - literal_token_start)); results += lit_def; } literal_tag = Undefined_Frame::Value; for (Iterator li = literal_token_start; li < i; li++) literal_flags[li] = False; continue; } } // don't analyze, part of literal if (literal_tag->undefined()) { if (!(word_results->undefined() || word_results->fail())) { add_alternate_cats( word_results); smcs_checker()->validate( word_results); adjust_head_pos( Definition::down_cast( word_results), input); results += word_results; } results += idiom_results; } else if (i == input.size() - 1 && literal_token_start < i) { // In case of rogue literal, add one def conprising rest of sentence Definition lit_def; Iterator start = input[ literal_token_start-1].substring().end(); lit_def.unify( Lexifier::Ortho, Frame( Symbol( Substring(input[0].substring().original_string(), start, input[i].substring().start() - start).string()))); add( lit_def, Subvector< Token>( input, literal_token_start, i - literal_token_start)); results += lit_def; literal_tag = Undefined_Frame::Value; for (Iterator li = literal_token_start; li < i; li++) literal_flags[li] = False; } } // Always create literal for final period. (If not done already) if (input.size() && (input_symbols[input.size()-1] == Period)) { Definition literal_def; add( literal_def, Subvector< Token>( input, input.size()-1, 1)); results += literal_def; literal_flags[ input.size() - 1] = False; } // Fill out Meaningless_Tokens. No single punctuation char may be meaningless. Iterator lfi; for (lfi = 0; lfi < input.size(); lfi++) { if (!literal_flags[lfi]) continue; Definition literal_def; add( literal_def, Subvector< Token>( input, lfi, 1)); if (literal_flags[lfi] && (input[lfi].type() != Token_Type::Punctuation) && (input[lfi].type() != Token_Type::Entity)) literal_def.unify( Symbol( Meaningless), Symbol::Plus); results += literal_def; } return results; }