/*----------------------------------------------------------
FILE
meaning.C
OVERVIEW
Lexifier program code
LOG
$Log: meaning.C,v $
Revision 1.82 2005/10/19 15:00:17 svoboda
Removed memtrays
Revision 1.81 2005/06/28 19:47:37 svoboda
Added token offsets to FS from lexifier
Revision 1.80 2005/02/24 18:59:19 svoboda
Several Windows portability changes s/string.H/kstring.H/g s/out/tout/
Revision 1.79 2003/04/30 19:26:49 svoboda
Now produces complete token sequences for S with rogue literal tag
Revision 1.78 2003/01/13 18:04:07 svoboda
Incorporated md5i's STD C++ compatability; now compilues under g++ 3.0
Revision 1.71.2.3 2002/11/14 21:27:55 md5i
Merge with trunk.
Revision 1.77 2002/10/15 20:29:27 svoboda
Adjustment to fix 507...kce-minus POSes not added to alternate-cats slot
Revision 1.76 2002/10/14 17:20:39 svoboda
Alternate-cats is now assigned to kce-minus terms
Revision 1.75 2002/10/07 21:06:18 svoboda
Fixed bug 488
Revision 1.74 2002/09/27 16:47:11 svoboda
Changed CTE to KCE for DMK & FS
Revision 1.73 2002/04/16 13:40:16 svoboda
Fixed 'hardened steel ball' regression - bug 393
Revision 1.72 2002/04/10 16:21:25 svoboda
Added diagnostics messages to Analyzer
Revision 1.71.2.2 2002/06/18 20:28:33 md5i
Merge in HEAD.
Revision 1.71.2.1 2002/03/19 18:08:43 md5i
Updates for STD C++.
Revision 1.71 2002/02/11 23:11:14 svoboda
Patrick_Statement_Lists must NOT be initialized at startup. AIX hates that
Revision 1.70 2002/02/11 20:17:40 svoboda
Oops, true is now a valid keyword...except on AIX
Revision 1.69 2001/11/29 21:29:07 svoboda
Added head-root slot, as per Bug 290
Revision 1.68 2001/10/22 20:39:45 svoboda
OK, overhauled unify_defs's initialization routines to be consistent
Revision 1.67 2001/10/19 21:38:22 svoboda
Bug 279 fix...don't unify idioms with words if no idioms
Revision 1.66 2001/10/16 18:09:15 svoboda
Bug 269 - unify_defs unifies class iff Idiom class not defined
Revision 1.65 2001/10/15 19:04:38 svoboda
unify-defs once again unifies semantic verb features (Bug 262)
Revision 1.64 2001/07/05 16:14:33 svoboda
Idiom-word inheritance no longer inherits scomp, xcomp, or apcomp (fix for Bug 156)
Revision 1.63 2000/11/03 20:35:01 svoboda
Adopted simpler output format for ambiguities under KCE. CTE outputs unchanged
Revision 1.62 2000/10/26 18:30:59 svoboda
Changed CTE to KCE for DMK slot and for disambig tags
Revision 1.61 2000/09/19 13:36:54 svoboda
Bugfix...final period may still also be part of an abbrev
Revision 1.60 2000/09/05 14:12:22 svoboda
Character entities may no longer be considered meaningless tokens
Revision 1.59 2000/08/29 16:17:54 svoboda
Bugfix, single punct chars may no longer be Meaningless_Tokens
Revision 1.58 2000/04/14 12:57:25 svoboda
Definition-unification now ignores abbreviations & acronyms
Revision 1.57 2000/03/14 12:09:37 svoboda
Analyzer now reports meaningless tokens
Revision 1.56 1999/12/03 13:58:33 svoboda
KANTOO now produces log output formatted as outlines
Revision 1.55 1999/10/07 13:33:48 svoboda
SMCS codes no longer included when unifying words & idioms
Revision 1.54 1999/09/29 17:14:53 svoboda
Bugfix...pass strings to morphonizer dequoted
Revision 1.53 1999/09/28 14:30:54 svoboda
Quick bugfix to hash wildcard feature
Revision 1.52 1999/09/28 13:39:03 svoboda
PR 5991 - added wildcard 'hash' symbols
Revision 1.51 1999/07/27 14:03:39 svoboda
Unify-definitions no longer prunes out idioms if their head has no matching POS
Revision 1.50 1999/07/19 12:09:38 svoboda
Oops, literals were still getting added
Revision 1.49 1999/07/16 12:30:30 svoboda
Eliminated numberizer/tagifier & fixed some memory leaks
Revision 1.48 1999/07/08 18:59:53 svoboda
Removed Numberizer and Tagifier...made obsolete by more powerful morphology
Revision 1.47 1999/07/03 12:02:25 svoboda
Moved is_int, etc. from Symbol_Frame to KString...where they belong
Revision 1.46 1999/07/02 18:59:23 svoboda
Symbol_Frames now replace Atomic_Frames, Boolean_Frames, and Integer_Frames
Revision 1.45 1999/06/17 14:29:52 svoboda
Symbols are now implemented with Frames, and like Frames
Revision 1.44 1999/05/11 12:53:27 md5i
Changes to allow symbols to be created staticly despite memtrays.
Revision 1.43 1999/03/22 13:04:21 md5i
Fencepost error fixed.
Revision 1.42 1999/03/19 17:12:59 md5i
Use all-heads to fix head-pos values in ?CTE attach tags.
Revision 1.41 1999/03/16 15:21:34 md5i
Memory leak fix.
Revision 1.40 1999/03/15 16:11:25 md5i
Memory fixes.
Revision 1.39 1999/03/01 16:12:29 svoboda
Added SMCS Codes
Revision 1.38 1999/01/29 10:45:49 svoboda
Made zero-numbering optional for units
Revision 1.37 1998/12/16 18:03:03 svoboda
Made fuzzy head-pos go both ways
Revision 1.36 1998/12/10 18:01:44 svoboda
Added disgusting fuzzy_head_pos hack in to Lexifier
Revision 1.35 1998/11/23 09:39:23 md5i
Merged Memory_Trays into main distribution.
Revision 1.34 1998/11/12 17:29:11 svoboda
Bugfixes: missing-bug, empty-literal-tags bug
Revision 1.33 1998/11/06 17:41:43 svoboda
DMK_Translator now points out invalid symbols in DMK
Revision 1.32 1998/11/02 15:46:21 svoboda
Syntaxifier now takes a graph of inputs, not a sequence
Revision 1.31.2.1 1998/11/13 17:45:18 md5i
Added memtrays.
Revision 1.31 1998/10/27 17:42:47 svoboda
OK got unify_defs working right now.
Revision 1.30 1998/10/26 18:59:00 svoboda
Slight bug in unify-defs; allow non-noun idioms to unify correctly
Revision 1.29 1998/10/13 19:32:02 svoboda
Some bugfixes wrt idiom/word unification
Revision 1.28 1998/10/08 18:26:09 svoboda
Oops...numbers were getting wrongly identified in phrases
Revision 1.27 1998/10/08 14:23:28 svoboda
Removed (cte -) from words when unifying with idioms
Revision 1.26 1998/10/07 16:59:28 svoboda
Fixed bug wrt idiom meanings/orthos/roots being truncated to head
Revision 1.25 1998/10/06 16:39:23 svoboda
Idioms now are constructed from symbols, not strings
Revision 1.24 1998/09/30 13:57:25 svoboda
Bugfixes in definition unification
* Revision 1.23 98/09/23 15:15:28 svoboda
* Fixed literals bug
*
Revision 1.22 1998/09/21 15:08:36 svoboda
Added numeric-value to lexifier for idioms
* Revision 1.21 98/09/11 12:36:52 md5i
* Symbols are now based on Memvector_KStrings.
*
Revision 1.20 1998/09/04 14:38:32 svoboda
Converted Lexifier to speak PATRICK
Revision 1.19 1998/08/03 16:37:12 svoboda
Fixed alternate-cats to ignore phrases and non-CTE words
Revision 1.18 1998/07/27 15:24:14 svoboda
Adjusted ED/ING to be first-class alternate-cats
Revision 1.17 1998/07/22 16:06:34 svoboda
Added alternate-cats ing and ed
Revision 1.16 1998/06/24 15:44:22 svoboda
Defs unify if typeids match, not cats
Revision 1.15 1998/06/23 17:16:14 svoboda
Added alternate-cats slot
Revision 1.14 1998/06/22 18:26:34 svoboda
The unify-defs procedure handles several more features now
Revision 1.13 1998/06/19 14:03:59 svoboda
Many bugfixes, incl. idiom inheritance, separate Defs, / & ' handling
Revision 1.12 1998/06/02 11:59:17 svoboda
Added install.data for language-related files to install.
Revision 1.11 1998/05/27 13:33:10 svoboda
Plugged memory leaks
Revision 1.10 1998/02/26 13:20:01 md5i
Shared Library name no longer hard codes KANTOO environ.
Revision 1.9 1998/01/13 10:52:48 md5i
Numbers are now Noun_Definitions, as opposed to Real Definitions.
Revision 1.8 1997/11/26 12:51:00 svoboda
Made partial DMK image optimization
* Revision 1.7 97/09/09 16:16:42 md5i
* More hacking to get things working under g++ with optimization.
*
Revision 1.6 1997/08/22 16:08:44 svoboda
Fixed bug of interpreting '.' as a number
Revision 1.5 1997/08/01 15:59:38 svoboda
Created Shared_Library class, extended Rule_Database to use it, improved logs
* Revision 1.4 97/06/05 19:18:06 svoboda
* Redesigned unification, adding paths/slots/typeless grammar
*
* Revision 1.3 97/06/01 16:34:03 svoboda
* Added symbol table class
*
* Revision 1.2 97/05/20 16:28:19 svoboda
* Added protocol class, made modules less interdependent
*
* Revision 1.1 97/05/13 13:22:04 svoboda
* Added Lexifier
*
AUTHOR
David Svoboda
Copyright (C) 1997
Carnegie Mellon
All Rights Reserved
----------------------------------------------------------*/
#include "dmk_database.H"
#include "deftypes.H"
#include "lexifier.H"
#include "morphonizer.H"
#include "morphsemizer.H"
#include "wordifier.H"
#include "idiomifier.H"
#include
#include "capitalization.H"
#include "idiom_dictary.H"
#include "sequence_frame.H"
#include "slot_frame.H"
#include "patrick_statements.H"
#include "smcs.H"
// Slot names that the lexifier writes to / reads from definitions.
const Symbol Lexifier::Root("root");
const Symbol Lexifier::Ortho("ortho");
const Symbol Lexifier::Head_Token("head-token");
// File-local (not a Lexifier member): slot that receives the lowercased head root.
const Symbol Head_Root("head-root");
const Symbol Lexifier::Attributes("attributes");
// Slot set to + when the root is one of the 'to be' forms.
const Symbol Lexifier::Being("being");
// Slot filled in by add_alternate_cats().
const Symbol Lexifier::Alt_Cats("alternate-cats");
const Symbol Lexifier::Hash_Value("hash");
const Frame Lexifier::Number_Category("number");
// Category frames consulted when computing alternate categories.
const Frame Lexifier::Verb("v");
const Frame Lexifier::ED("ed");
const Frame Lexifier::ING("ing");
// Paths into a tag's attributes (all-heads and sel are read by adjust_head_pos).
const Patrick_Path Attributes_Head_Pos("%(attributes head-pos)");
const Patrick_Path Attributes_All_Heads("%(attributes all-heads)");
const Patrick_Path Attributes_Sel("%(attributes sel)");
// Path tested on idioms by unify_definitions(); an idiom that defines it
// causes the class-related slots to be stripped from the word definition.
const Patrick_Path Snobbish("%(concepts class)");
// Fallback category handed to Meaning when a definition has no symbol category.
const KString Lexifier::Special("SP");
const Concept_ID Numeric_Concept_ID("C", "DECIMAL-NUMBER");
// Statement execute() uses to recognize literal tags among the word results.
const Patrick_Statement Is_Literal("(%(concepts contents) =c literal)");
// Slot set to + on leftover tokens that received no other definition.
const Symbol Meaningless("meaningless");
// Slots holding the start of the first and the end of the last token's substring.
const Symbol Token_Start("token-start");
const Symbol Token_End("token-end");
//////////
// Works out which input tokens a definition spans.
//   d          - definition whose Idiom_Dictionary::Words and
//                Idiom_Dictionary::Head slots are consulted
//   idiom_head - (out) position of the head inside the span; taken from the
//                Head slot, or the last position of the span when that slot
//                holds no symbol
//   input      - the full token vector
//   token_head - index in input of the token acting as head
// Returns the subvector of input that the definition covers.  A definition
// whose Words slot holds no symbol is taken to span a single token.
Subvector< Token> tokens(const Definition& d, Iterator& idiom_head,
                         const Vector< Token>& input, Iterator token_head) {
  const Symbol_Frame* word_count
    = Symbol_Frame::down_cast( d.slot_value( Idiom_Dictionary::Words).value());
  Iterator span = 1;
  if (word_count) span = word_count->value().int_value();

  const Symbol_Frame* head_slot
    = Symbol_Frame::down_cast( d.slot_value( Idiom_Dictionary::Head).value());
  if (!head_slot) idiom_head = span - 1;
  else idiom_head = head_slot->value().int_value();

  // The span begins idiom_head tokens before the head token.
  Iterator first = token_head - idiom_head;
  return Subvector< Token>( input, first, span);
}
//////////
// Builds the list of word forms treated as the verb 'to be'.  The list is
// consulted (via To_Be_Forms below) when deciding whether a definition gets
// the (being +) slot.
Vector< KString> to_be_forms() {
  const char* const forms[] = {
    "be", "am", "are", "is", "was", "were", "been", "being", "beings"
  };
  const unsigned form_count = sizeof( forms) / sizeof( forms[0]);

  Vector< KString> result;
  for (unsigned f = 0; f < form_count; f++)
    result += KString( forms[f]);
  return result;
}
// Built once, when this file's globals are initialized.
Vector< KString> To_Be_Forms = to_be_forms();
//////////
// Fills in the bookkeeping slots of definition d for the token span `tokens`.
//   d          - definition to complete (modified in place)
//   tokens     - span of input tokens the definition covers; must be non-empty
//   head_root  - root form of the head token; when left at KString::Default
//                the head token's own text is used for %head-root, and no
//                meaning is added
//   head       - position of the head token within the span
//   hash_value - frame stored in the hash slot, unless it is undefined or the
//                slot is already set
// Slots written: tokens, token-start, token-end, head-token (multi-token spans
// only), head-root, ortho and root (each only if not already present), a
// meaning (only if none is present and head_root is non-empty), being (when
// the root is a form of 'to be') and hash.
void add( Definition& d, const Subvector< Token>& tokens,
          const KString& head_root = KString::Default, Iterator head = 0,
          const Frame& hash_value = Undefined_Frame::Value) {
  // NOTE(review): head is used below as a 0-based index into the span, so `>`
  // looks like the intended bound here -- confirm before tightening.
  ASSERT( tokens.length() && (tokens.length() >= head));

  // Bind the underlying vector by reference; the previous by-value local
  // copied every token of the sentence on each call.
  const Vector< Token>& all_tokens = tokens.original_vector();
  const Token& first_token = all_tokens[ tokens.start()];
  const Token& last_token = all_tokens[ tokens.end() - 1];

  // Add %tokens (token numbers are written 1-based)
  Frame token_frame;
  for (Iterator t = tokens.start(); t < tokens.end(); t++)
    token_frame.f_and( Frame( Symbol( KString_Format( t+1))));
  d.unify( Definition::Tokens, token_frame);
  d.unify( Token_Start, Frame( Symbol( KString_Format( first_token.substring().start()))));
  d.unify( Token_End, Frame( Symbol( KString_Format( last_token.substring().end()))));

  // Add %head-token
  if (tokens.length() > 1)
    d.unify( Lexifier::Head_Token,
             Frame( Symbol( KString_Format( head + tokens.start() + 1))));

  // Add %head-root
  KString hr = (head_root != KString::Default) ? head_root :
    all_tokens[ head + tokens.start()].substring().string();
  hr.capitalization( Capitalization::Lowercase);
  d.unify( Head_Root, Frame( Symbol( hr)));

  // Add %ortho unless already defined: the original text running from the
  // start of the first token to the end of the last one.
  Iterator start = first_token.substring().start();
  Iterator end = last_token.substring().end();
  if (d.slot_value( Lexifier::Ortho)->undefined())
    d.unify( Lexifier::Ortho, Frame( Symbol(
      Substring( all_tokens[0].substring().original_string(),
                 start, end - start).string())));

  // Add %root unless already defined
  KString root;
  const Symbol_Frame* rootf
    = Symbol_Frame::down_cast( d.slot_value( Lexifier::Root).value());
  if (rootf) root = rootf->value();
  else {
    // Space-separated token texts, with head_root standing in for the head.
    for (Iterator ti = 0; ti < tokens.length(); ti++) {
      if (ti) root += ' ';
      if (ti == head && head_root.size()) root += head_root;
      else root += all_tokens[ tokens.start() + ti].substring().string();
    }
    root.capitalization( Capitalization::Lowercase);
    d.unify( Lexifier::Root, Frame( Symbol( root)));
  }

  // Add %meaning unless already defined or literal
  if (d.slot_value( Definition::Meanings)->undefined() && head_root.size()) {
    Frame posf = d.slot_value( Definition::Category);
    const Symbol_Frame* possf = Symbol_Frame::down_cast( posf.value());
    d.meaning( Meaning( possf ? possf->value() : Lexifier::Special,
                        root, tokens.start() + head + 1));
  }

  // Add (being +) if necessary
  if (To_Be_Forms.find( root) != Invalid_Iterator)
    d.unify( Lexifier::Being, Symbol::Plus);

  // Add %hash-value
  if (!(hash_value->undefined()) && d.slot_value( Lexifier::Hash_Value)->undefined())
    d.unify( Lexifier::Hash_Value, hash_value);
}
/////////
// Unifies each idiom definition with the word definition of its head.
//   idioms - idiom definitions found for the current token
//   words  - word definition for the head; it is pruned IN PLACE before the
//            unification (abbreviation, acronym, concept id, SMCS slots and
//            kce are removed, and the class-related concept slots too when
//            any idiom defines %(concepts class)), so the caller sees the
//            pruned version afterwards
// Returns one entry per idiom: the unified definition, or the idiom itself
// when unification fails.  For an idiom that is a set of alternatives, each
// alternative is unified separately and an alternative that fails to unify
// is kept as it was.  Returns an empty vector when there are no idioms, in
// which case words is left untouched.
Vector< Definition> unify_definitions(const Set< Definition>& idioms,
                                      Definition& words) {
  Vector< Definition> unified;
  if (idioms.size() == 0) return unified;

  // Does any idiom bring its own %(concepts class)?
  bool snobbish = False;
  for (Iterator probe = 0; !snobbish && probe < idioms.size(); probe++) {
    Frame candidate = idioms[probe];
    if (!(Snobbish.evaluate( candidate)->undefined())) snobbish = True;
  }

  // Clean up the word definition before unifying against it.
  static Patrick_Statement_List Prepare_Defs(
    "((%abbreviation = *REMOVE*)"
    " (%acronym = *REMOVE*)"
    " (%(concepts id) = *REMOVE*)"
    " (%(concepts smcs) = *REMOVE*)"
    " (%(concepts smcs-class) = *REMOVE*)"
    " (*OR* ((%pos = (*OR* unit n prop pron))"
    " (%pos <= (*OR* unit n prop pron)))"
    " ((%pos = (*NOT* (*OR* unit n prop pron))))))");
  Prepare_Defs.evaluate( words);
  static Patrick_Statement_List Remove_Class(
    "((%(concepts class) = *REMOVE*)"
    " (%(concepts apcomp) = *REMOVE*)"
    " (%(concepts scomp) = *REMOVE*)"
    " (%(concepts xcomp) = *REMOVE*)"
    " (%(concepts valency) = *REMOVE*))");
  if (snobbish) Remove_Class.evaluate( words);
  Patrick_Statement("(%kce = *REMOVE*)").evaluate( words);

  for (Iterator n = 0; n < idioms.size(); n++) {
    Set_Frame* alternatives = (Set_Frame*) Set_Frame::down_cast( idioms[n].value());
    Frame merged;
    if (alternatives) {
      // Unify alternative by alternative, keeping the ones that do not unify.
      for (Iterator a = 0; a < alternatives->values().size(); a++) {
        Frame attempt = alternatives->values()[a]->unify( words);
        if (attempt->fail()) merged.f_or( alternatives->values()[a]);
        else merged.f_or( attempt);
      }
    } else {
      merged = idioms[n]->unify( words);
    }
    if (merged->fail()) unified += idioms[n];
    else unified += Definition::down_cast( merged);
  }
  return unified;
}
//////////
// Add alternate-cats fields
// defs is replaced, on exit, by the *OR* of its members, each member having
// received an alternate-cats slot where one could be computed.  Nothing is
// done when (%kce = +) fails on defs or when defs is not a Set_Frame.
void add_alternate_cats( Frame& defs) {
// Work on a copy: the statement is evaluated against kce_defs, not defs.
Frame kce_defs( defs);
if (Patrick_Statement("(%kce = +)").evaluate( kce_defs)->fail()) return;
const Set_Frame* sdefs = Set_Frame::down_cast( defs.value());
if (!sdefs) return;
// Categories are collected from kce_defs (the copy the kce statement ran on).
Frame all_cats // must use path, should be *OR*
= Patrick_Path( Vector< Symbol>( 1, Definition::Category)).evaluate( kce_defs);
// temp is re-copied from the source before every evaluate() below.
Frame temp = defs;
Patrick_Statement_List Verb_ED(
"((%past =c +)"
" (%pred-passive = +)"
" (%(concepts class subject) =c agent))");
// ed: the Verb_ED statements do not fail on the definitions as a whole.
bool ed = !(Verb_ED.evaluate( temp)->fail());
temp = defs;
Patrick_Statement_List Verb_ING(
"((%past = -)"
" (%participle =c +)"
" (%(concepts valency) =c intransitive))");
// ing: likewise for the Verb_ING statements.
bool ing = !(Verb_ING.evaluate( temp)->fail());
// When either applies, the plain verb category is excluded from all_cats.
if (ed || ing) all_cats.unify( (Lexifier::Verb)->f_not());
// Now assign the categories to each definition in the *OR*
Frame new_defs;
for (Iterator i = 0; i < sdefs->values().size(); i++) {
// NOTE(review): alt_cats and the inner `defs` (which shadows the parameter)
// are not read anywhere in this loop body.
Frame alt_cats;
Frame new_def = sdefs->values()[i], defs( new_def);
Frame this_cat = new_def.slot_value( Definition::Category);
// Alternates for this member: every collected category except its own.
Frame new_alt_cats( all_cats);
if (new_alt_cats.unify( this_cat.f_not())->fail())
new_alt_cats = Undefined_Frame::Value;
// ED / ING are offered as alternates only to members that do not
// themselves satisfy the corresponding statements.
temp = new_def;
if (ed && Verb_ED.evaluate( temp)->fail())
new_alt_cats.f_or( Lexifier::ED);
temp = new_def;
if (ing && Verb_ING.evaluate( temp)->fail())
new_alt_cats.f_or(Lexifier::ING);
if (!new_alt_cats->undefined()) new_def.unify( Lexifier::Alt_Cats,
new_alt_cats);
new_defs.f_or( new_def);
}
defs = Definition::down_cast( new_defs);
}
// This algorithm is specifically tailored to work with CAT's LE...because it
// sometimes gets head_pos wrong (usually because the head-pos value is obsolete)
// This algorithm should not apply to anything except CTE.
// (It will not work unless a tag has the all-heads attribute set, which is only
// done by the LE. Besides, we haven't yet needed structural tags in KCE anyway.)
// Therefore, this algorithm is purely meant for CTE, not KCE
//
// Visible behavior: returns immediately when the tag has no all-heads
// attribute; throws Data_Error when all-heads or sel is malformed, when sel is
// missing, or when sel is out of range.
void adjust_head_pos(Definition& tag, const Vector& tokens) {
Frame all_heads = Attributes_All_Heads.evaluate(tag);
if (all_heads->undefined()) return;
const Symbol_Frame *heads = Symbol_Frame::down_cast(all_heads.value());
if (!heads) THROW(Data_Error, << "Invalid all-heads in tag:\n" << tag);
Frame selection = Attributes_Sel.evaluate(tag);
if (selection->undefined())
THROW(Data_Error, << "Missing sel in tag:\n" << tag);
const Symbol_Frame *sel = Symbol_Frame::down_cast(selection.value());
int selval;
if (!sel) THROW(Data_Error, << "Invalid sel in tag:\n" << tag);
selval = sel->value().int_value();
// all-heads is a quoted string that splits into an even number of fields,
// i.e. one pair of fields per selectable head.
KString heads_string = heads->value();
heads_string.dequote();
Vector heads_strings = heads_string.split();
if (heads_strings.size() % 2)
THROW(Data_Error, << "Attribute all-heads wrong size:\n" << tag);
// n = number of pairs; sel is a 1-based index into them.
Iterator n = heads_strings.size() >> 1;
if (selval > n || selval < 1)
THROW(Data_Error, << "Attribute sel is invalid:\n" << tag);
// Target offset: first field of the selected pair, shifted down by one.
int offset = heads_strings[(selval - 1) * 2].int_value() - 1;
int current_offset = tokens[0].substring().start();
Iterator j;
// Scan the tokens while the running offset is still short of the target.
for (j = 0; current_offset < offset && j < tokens.size(); j++) {
if (tokens[j].type() == Token_Type::Tag)
if (tokens[j].substring().length() > 5) {
// Compare the uppercased first five characters of the tag text.
KString tmp(tokens[j].substring().string(), 0, 5);
tmp.capitalization(Capitalization::Uppercase);
// NOTE(review): the next line looks truncated -- the string literal is never
// closed and the text that follows belongs to a different function (it uses
// input, output, Hash and hashes, and execute() calls try_hashing with three
// such arguments).  The rest of adjust_head_pos and the opening of that
// function appear to be missing; restore from the original file.
if (tmp != KString("& hashes) {
const KString& token = input.value();
// if input is single char, use it.
if ((token.size() == 1) && isalpha( token[0])) {
output = Hash;
hashes += token;
return True;
}
// if input contains numbers, include them all
bool number_flag = False;
KString new_hash;
for (Iterator i = 0; i < token.size(); i++) {
if (isdigit( token[i])) {
if (!number_flag) {
number_flag = True;
hashes += KString( 1, token[i]);
new_hash += Hash.value()[1];
} else {
hashes[ hashes.size() - 1] += token[i];
}} else {
number_flag = False;
new_hash += token[i];
}}
output = Symbol( new_hash);
return (input != output);
}
// Fill value used below to size the per-token vector of hash strings.
static Vector< KString> Default_Vector_KString;
//////////
// The execute function. Will run the main algorithm described
// above.
//   input - the tokens of one sentence
// Returns the definitions produced for the sentence: word definitions, idiom
// definitions, literal definitions for text enclosed by literal tags, a
// literal for a final period, and a literal for every token that nothing else
// covered.
Vector< Frame> Lexifier::execute(const Vector< Token>& input) const {
Vector< Frame> results;
Iterator i;
// This vector will indicate what tokens need literal meanings on them.
// A flag is cleared as soon as some definition covers the token.
Vector< bool> literal_flags( input.size(), True);
Frame literal_tag; // becomes a tagname when a literal tag appears
// Index of the first token after the opening literal tag; only meaningful
// while literal_tag is defined.
Iterator literal_token_start;
// The various strings of every input token.
Vector< Symbol> input_symbols;
for (i = 0; i < input.size(); i++)
input_symbols += Symbol( input[i].substring().string());
// Construct hash info
// hashed_symbols[i] is the wildcard form of token i and hashes[i] the pieces
// it replaced, both filled in by try_hashing; hashable records whether
// try_hashing returned true for any token.
Vector< Symbol> hashed_symbols( input.size(), Symbol::Default);
Vector< Vector< KString> > hashes( input.size(), Default_Vector_KString);
bool hashable = False;
for (i = 0; i < input.size(); i++)
hashable = try_hashing( input_symbols[i], hashed_symbols[i], hashes[i]) || hashable;
// Now analyze tokens
for (i = 0; i < input.size(); i++) {
Vector< Frame> idiom_results;
Frame word_results;
// Create trivial morpheme, or nontrivial one if token is word.
// The token text is handed to the morphonizer dequoted.
KString is = input_symbols[i].value();
is.dequote();
Set< Morpheme> sm = morphonizer()->run( is);
for (Iterator m = 0; m < sm.size(); m++) {
Vector< Definition> idiom_defs;
Definition word_defs, unifiable_word_defs;
const KString& root = sm[m].root();
word_defs = wordifier()->run( root);
// Call Morphsemizer on words
if (!(word_defs->undefined() || word_defs->fail())) {
// Keep a copy of the word definitions for idiom unification further down.
unifiable_word_defs = word_defs;
Description desc( word_defs, sm[m]);
Definition final_defs = morphsemizer()->run( desc);
Iterator head;
Subvector< Token> words;
if (!(final_defs->undefined() || final_defs->fail()) ) {
Set_Frame* sf = Set_Frame::down_cast((Proto_Frame&) (final_defs.mutable_value()));
const Definition& wr = sf ? Definition::down_cast( sf->values()[0]) : final_defs;
// NOTE(review): words and head are assigned here but not read again in
// this branch.
words = tokens( wr, head, input, i);
// Word definitions always cover exactly the current token.
Subvector< Token> st( input, i, 1);
if (sf) for (Iterator sfi = 0; sfi < sf->values().size(); sfi++)
add( Definition::down_cast( sf->mutable_values()[sfi]), st, root);
else add( final_defs, st, root);
literal_flags[i] = False;
word_results.f_or( final_defs);
}}
// Now search for idioms.
// The morpheme root temporarily replaces token i while the idiom key is
// built; the original symbol is restored afterwards.
Symbol old_idiom = input_symbols[i];
input_symbols[i] = Symbol( root);
Idiom idiom( input_symbols, i);
idiom_defs += unify_definitions( idiomifier()->run( idiom),
unifiable_word_defs);
input_symbols[i] = old_idiom;
// If sentence contains hashable elements, try using hashed_symbols
if (hashable) {
old_idiom = hashed_symbols[i];
hashed_symbols[i] = Symbol( root);
idiom = Idiom( hashed_symbols, i);
idiom_defs += unify_definitions( idiomifier()->run( idiom),
unifiable_word_defs);
hashed_symbols[i] = old_idiom;
}
// Call Morphsemizer to validate/alter idiom definitions
for (Iterator sdi = 0; sdi < idiom_defs.size(); sdi++) {
Description desc = Description( idiom_defs[sdi], sm[m]);
Definition altered_def = morphsemizer()->run( desc);
if (altered_def->undefined() || altered_def->fail()) continue;
// Figure out just how many tokens were used in the Definition.
// (For a set of alternatives the first one is consulted.)
Set_Frame* sf = (Set_Frame*) Set_Frame::down_cast( altered_def.value());
const Definition& ad
= sf ? Definition::down_cast( sf->values()[0]) : altered_def;
Iterator head;
Subvector< Token> words = tokens( ad, head, input, i);
// Figure out hash frames
// All hash strings of all covered tokens are and-ed into one frame.
Frame hash_frame;
for (Iterator wi = words.start(); wi < words.end(); wi++)
for (Iterator whi = 0; whi < hashes[wi].size(); whi++)
hash_frame.f_and( Frame( Symbol( hashes[wi][whi])));
if (sf) {
for (Iterator sfi = 0; sfi < sf->values().size(); sfi++)
add( Definition::down_cast( sf->mutable_values()[sfi]), words, root, head, hash_frame);
} else add( altered_def, words, root, head, hash_frame);
smcs_checker()->validate( altered_def);
idiom_results += altered_def;
// Every token the idiom covers no longer needs a literal.
for (Iterator li = words.start(); li < words.end(); li++)
literal_flags[li] = False;
}}
// Several special things to do with tags
// Is_Literal is evaluated on a copy (wr) of the word results.
Frame wr( word_results);
if (!(Is_Literal.evaluate( wr)->fail())) {
const Frame& root = word_results.slot_value( Lexifier::Root);
if (literal_tag->undefined()) { // start of literal sequence
results += word_results;
literal_tag = root;
literal_token_start = i+1;
} else if (literal_tag == root) {
// Same tag again: this closes the literal sequence.
results += word_results;
// don't add literal tokens if there are none
if (literal_token_start < i) {
Definition lit_def;
// Ortho is the original text between the end of the opening tag and the
// start of this closing tag.
Iterator start = input[ literal_token_start-1].substring().end();
lit_def.unify( Lexifier::Ortho, Frame( Symbol(
Substring( input[0].substring().original_string(),
start, input[i].substring().start() - start).string())));
add( lit_def, Subvector< Token>( input, literal_token_start,
i - literal_token_start));
results += lit_def;
}
literal_tag = Undefined_Frame::Value;
for (Iterator li = literal_token_start; li < i; li++)
literal_flags[li] = False;
continue;
}
}
// don't analyze, part of literal
// (word and idiom results are only emitted while no literal tag is open)
if (literal_tag->undefined()) {
if (!(word_results->undefined() || word_results->fail())) {
add_alternate_cats( word_results);
smcs_checker()->validate( word_results);
adjust_head_pos( Definition::down_cast( word_results), input);
results += word_results;
}
results += idiom_results;
} else if (i == input.size() - 1 && literal_token_start < i) {
// In case of rogue literal, add one def comprising rest of sentence
// (the span built here stops just before the last token)
Definition lit_def;
Iterator start = input[ literal_token_start-1].substring().end();
lit_def.unify( Lexifier::Ortho, Frame( Symbol(
Substring(input[0].substring().original_string(),
start, input[i].substring().start() - start).string())));
add( lit_def, Subvector< Token>( input, literal_token_start,
i - literal_token_start));
results += lit_def;
literal_tag = Undefined_Frame::Value;
for (Iterator li = literal_token_start; li < i; li++)
literal_flags[li] = False;
}
}
// Always create literal for final period. (If not done already)
// NOTE(review): the code below adds the literal unconditionally; it does not
// check whether the period already received a definition.
if (input.size() && (input_symbols[input.size()-1] == Period)) {
Definition literal_def;
add( literal_def, Subvector< Token>( input, input.size()-1, 1));
results += literal_def;
literal_flags[ input.size() - 1] = False;
}
// Fill out Meaningless_Tokens. No single punctuation char may be meaningless.
// Every token still flagged gets a literal definition; it is additionally
// marked meaningless unless it is punctuation or an entity.
Iterator lfi;
for (lfi = 0; lfi < input.size(); lfi++) {
if (!literal_flags[lfi]) continue;
Definition literal_def;
add( literal_def, Subvector< Token>( input, lfi, 1));
if (literal_flags[lfi] && (input[lfi].type() != Token_Type::Punctuation)
&& (input[lfi].type() != Token_Type::Entity))
literal_def.unify( Symbol( Meaningless), Symbol::Plus);
results += literal_def;
}
return results;
}