1% This file is part of the Attempto Parsing Engine (APE).
    2% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
    3%
    4% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it
    5% under the terms of the GNU Lesser General Public License as published by the Free Software
    6% Foundation, either version 3 of the License, or (at your option) any later version.
    7%
    8% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT
    9% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
   10% PURPOSE. See the GNU Lesser General Public License for more details.
   11%
   12% You should have received a copy of the GNU Lesser General Public License along with the Attempto
   13% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/.
   14
   15
   16:- module(ape_utils, [
   17		cpu_time/2,
   18		handle_unknown_words/4,
   19		new_npid/1
   20	]).

APE utils

author
- Kaarel Kaljurand
- Tobias Kuhn
version
- 2009-11-18

*/

   31:- use_module(grammar_words).   32:- use_module('../lexicon/spellcheck').   33:- use_module('../lexicon/is_in_lexicon').   34:- use_module('../lexicon/functionwords').   35:- use_module('../lexicon/chars', [is_capitalized/1]).   36:- use_module('../logger/error_logger', [
   37		add_error_messagelist/4,
   38		add_error_message/4
   39	]).   40
   41:- use_module(sentence_failure, [
   42		get_all_sentence_errors/2
   43	]).   44
   45:- use_module('../utils/ace_niceace', [
   46		ace_niceace/2
   47	]).   48
   49
   50:- op(400, fy, -).   51:- op(400, fy, ~).   52:- op(500, xfx, =>).   53:- op(500, xfx, v).   54
   55%:- debug(known_word).
   56
   57cpu_time(Goal, Duration) :-
   58	statistics(runtime, [Start | _]),
   59	ignore(Goal),
   60	statistics(runtime, [Finish | _]),
   61	Duration is (Finish-Start) * 0.001.
   62
   63% @deprecated
   64read_text_from_commandline([C|R]) :-
   65	get0(C),
   66	line_continues(C),
   67	!,
   68	read_text_from_commandline(R).
   69
   70read_text_from_commandline([]).
   71
   72% @deprecated
   73line_continues(C) :- C \== 10, C \== -1, !.
 list_close(+OpenList, -ClosedList) is det
Closes an open list.
   80list_close([], []) :- !.
   81
   82list_close([Head | Tail], [Head | ClosedTail]) :-
   83	list_close(Tail, ClosedTail).
 member_of_domain(+Element:var, +List:list) is det
member/2 for variables
bug
- we should not catch wellformedness bugs here, but in the beginning. (Dom must always be a closed list)

Note: Can Dom be open during parsing? (member_of_domain/2 is called from add_modifiers)

   96member_of_domain(E, [F | _]) :-
   97	E==F,
   98	!.
   99
  100member_of_domain(E, [_ | R]) :-
  101	nonvar(R),  % catches a bug
  102	member_of_domain(E,R).
 list_of_conds_and_anaphors(+List, -List, -AnaphorAnteList)
Filters out functors `antecedent' and `anaphor' and closes the list. Copies all `anaphor' and `antecedent' functors into the extra argument.

We have to do it recursively for the following tests to work.

bug
- Type-argument is currently ignored.
  122list_of_conds_and_anaphors(Tail, [], []) :- var(Tail), !.
  123
  124list_of_conds_and_anaphors([], [], []).
  125
  126list_of_conds_and_anaphors([Head | Tail], CondsTail, [Head|AnaAnte]) :-
  127	functor(Head, antecedent, _),
  128	!,
  129	list_of_conds_and_anaphors(Tail, CondsTail, AnaAnte).
  130
  131list_of_conds_and_anaphors([Head | Tail], CondsTail, [Head|AnaAnte]) :-
  132	Head = anaphor(_, _, _, AnaphorConds, _, _, _, _, _, _, _),
  133	!,
  134	list_of_conds_and_anaphors(Tail, CondsTailTemp1, AnaAnte),
  135	append(AnaphorConds, CondsTailTemp1, CondsTailTemp2),
  136	sort(CondsTailTemp2, CondsTail).
  137
  138list_of_conds_and_anaphors([Condition | CondsTail], [ConditionPruned | CondsTailPruned], AnaAnte) :-
  139	handle_condition(Condition, ConditionPruned),
  140	!,
  141	list_of_conds_and_anaphors(CondsTail, CondsTailPruned, AnaAnte).
  142
  143% Simple condition.
  144list_of_conds_and_anaphors([Head | Tail], [Head | CondsTail], AnaAnte) :-
  145	list_of_conds_and_anaphors(Tail, CondsTail, AnaAnte).
 handle_condition(+Condition, -ConditionPruned)
  152handle_condition([First|Rest], CondsPruned) :-
  153	list_of_conds_and_anaphors([First|Rest], CondsPruned, _).
  154
  155handle_condition(-drs(Dom, Conds), -drs(Dom, CondsPruned)) :-
  156	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  157
  158handle_condition(Label:drs(Dom, Conds), Label:drs(Dom, CondsPruned)) :-
  159	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  160
  161handle_condition(can(drs(Dom, Conds)), can(drs(Dom, CondsPruned))) :-
  162	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  163
  164handle_condition(must(drs(Dom, Conds)), must(drs(Dom, CondsPruned))) :-
  165	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  166
  167handle_condition(should(drs(Dom, Conds)), should(drs(Dom, CondsPruned))) :-
  168	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  169
  170handle_condition(may(drs(Dom, Conds)), may(drs(Dom, CondsPruned))) :-
  171	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  172
  173handle_condition(~drs(Dom, Conds), ~drs(Dom, CondsPruned)) :-
  174	list_of_conds_and_anaphors(Conds, CondsPruned, _).
  175
  176handle_condition(drs(Dom1, Conds1) => drs(Dom2, Conds2), drs(Dom1, Conds1Pruned) => drs(Dom2, Conds2Pruned)) :-
  177	list_of_conds_and_anaphors(Conds1, Conds1Pruned, _),
  178	list_of_conds_and_anaphors(Conds2, Conds2Pruned, _).
  179
  180handle_condition(drs(Dom1, Conds1) v drs(Dom2, Conds2), drs(Dom1, Conds1Pruned) v drs(Dom2, Conds2Pruned)) :-
  181	list_of_conds_and_anaphors(Conds1, Conds1Pruned, _),
  182	list_of_conds_and_anaphors(Conds2, Conds2Pruned, _).
 report_failed_sentence(+SentenceID:number, +FirstUnparsedSentence:list) is det
Arguments:
SentenceID- is a sentence ID
UnparsedSentences- is a list of (remaining) tokens that were not accepted
bug
- We also return the token ID that identifies the first unparsed token. This ID is misleading though as it references a token in the internal token list which contains things like '^', '-thing', 'n', ':', etc. Thus the number can point to a location further than the actual location perceived by the user. (We do subtract 1 to account for the '^' symbol present in every token list, i.e. otherwise the formula would be: TokenID is (AllTokensNumber - UnparsedTokensNumber) + 1).
  197report_failed_sentence(SentenceID, FirstSentence) :-
  198	get_unparsed_tokens_number(UnparsedTokensNumber),
  199	length(UnparsedTokens, UnparsedTokensNumber),
  200	append(ParsedTokens, UnparsedTokens, FirstSentence),
  201	append(ParsedTokens, ['<>'|UnparsedTokens], FirstSentenceX),
  202	get_all_sentence_errors(FirstSentenceX, ErrorTextList),
  203	ace_niceace(FirstSentenceX, FirstSentenceNice),
  204	concat_atom(FirstSentenceNice, ' ', FirstSentenceAtom),
  205	length(FirstSentence, AllTokensNumber),
  206	TokenID is (AllTokensNumber - UnparsedTokensNumber),
  207	add_error_messagelist(sentence, SentenceID-TokenID, FirstSentenceAtom, ErrorTextList).
 spellcheck_and_report_unknown_words(+UnknownTokenList)
Arguments:
UnknownTokenList- is a list of unknown tokens.
  214spellcheck_and_report_unknown_words(UnknownTokenList, N) :-
  215	spellcheck:damerau_rules(UnknownTokenList, Candidates, NoSolutions),
  216	!,
  217	report_candidates(Candidates, N),
  218	report_nosolutions(NoSolutions, N).
  219
  220report_candidates([], _).
  221
  222report_candidates([Bad-Good | T], N) :-
  223	add_error_message(word, N-'', Bad, Good),
  224	report_candidates(T, N).
  225
  226report_nosolutions([], _).
  227
  228report_nosolutions([Bad | T], N) :-
  229	add_error_message(word, N-'', Bad, 'Use the prefix n:, v:, a: or p:.'),
  230	report_nosolutions(T, N).
 handle_unknown_words(+GuessOnOff:atom, +Tokens:list, -TokensWithPrefixes:list) is det
  235handle_unknown_words(_, [], [], _).
  236
  237handle_unknown_words(on, [SentenceIn|RestIn], [SentenceOut|RestOut], _) :-
  238	!,
  239	prefix_unknown_words(SentenceIn, SentenceOut),
  240	handle_unknown_words(on, RestIn, RestOut, _).
  241
  242handle_unknown_words(G, [Sentence|Rest], [Sentence|Rest], N) :-
  243	get_unknown_words(Sentence, UnknownTokens),
  244	spellcheck_and_report_unknown_words(UnknownTokens, N),
  245	NextN is N + 1,
  246	handle_unknown_words(G, Rest, Rest, NextN).
 get_unknown_words(+TokenList, -UnknownTokenList)
bug
- repeated unknowns are not removed. Ways to fix it:
  • sort/2 the list in the end
  • use some hashing technique
  255get_unknown_words(TokenList, UnknownTokenList) :-
  256	get_unknown_words(TokenList, [x, x], UnknownTokenList).
  257
  258get_unknown_words([], _, []).
  259
  260get_unknown_words([Token | TokensTail], [Left2, Left1], UnknownTokenList) :-
  261	get_right_context(TokensTail, Right1),
  262	(
  263	known_or_capitalized(Left2, Left1, Token, Right1) -> UnknownTokenList = UnknownTokenListTail
  264	;
  265	UnknownTokenList = [Token | UnknownTokenListTail]
  266	),
  267	get_unknown_words(TokensTail, [Left1, Token], UnknownTokenListTail).
 known_or_capitalized(+Left2:atom, +Left1:atom, +WordForm:atom, +Right1:atom) is det
  273known_or_capitalized(_, _, WordForm, _) :-
  274	chars:is_capitalized(WordForm),
  275	!.
  276
  277known_or_capitalized(Left2, Left1, WordForm, Right1) :-
  278	debug(known_word, "~w ~w *~w* ~w~n", [Left2, Left1, WordForm, Right1]),
  279	known_word(Left2, Left1, WordForm, Right1),
  280	debug(known_word, "ok~n", []).
 prefix_unknown_words(+TokenList, -TokenListWithPrefixes)
Add variable prefixes to unknown words. The prefixes are grounded by the first successful parse. This essentially amounts to guessing the word class (noun, verb, adjective, adverb) of unknown words.

Note that propernames are difficult to guess since they are not introduced by a marker (like 'a' or 'every'). Adding a variable prefix makes the variable unify also with function words (e.g. 'if'). This causes a performance problem and doesn't work anyway. So, for the time being, we simply don't guess propernames (eventhough propernames deserve guessing the most, being a very open class).

  296prefix_unknown_words(TokenList, TokenListWithPrefixes) :-
  297	prefix_unknown_words(TokenList, [x, x], TokenListWithPrefixes).
  298 
  299prefix_unknown_words([], _, []).
  300
  301prefix_unknown_words([Token | TokensTail], [Left2, Left1], TokenListWithPrefixes1) :-
  302	get_right_context(TokensTail, Right1),
  303	(
  304	known_or_capitalized(Left2, Left1, Token, Right1) ->
  305	TokenListWithPrefixes1 = [Token | TokenListWithPrefixes]
  306	;
  307	TokenListWithPrefixes1 = [unknowncat, :, Token | TokenListWithPrefixes]
  308	),
  309	prefix_unknown_words(TokensTail, [Left1, Token], TokenListWithPrefixes).
 get_right_context(+List, -FirstElement)
Returns the first element of the list, or 'x' if the list is empty.
bug
- This predicate is experimental.
  318get_right_context([], x).
  319
  320get_right_context([Head | _], Head).
 known_word(+Left2:atom, +Left1:atom, +Token:atom, +Right1:atom) is det
Token is "known" iff:

it is in quotation marks; it is prefixed; it is a prefix symbol; it is in the lexicon.

  332known_word(_, _, String, _) :-
  333    atom_concat('"', S, String),
  334    atom_concat(_, '"', S).
  335
  336known_word(Left2, ':', _, _) :-
  337	(
  338		functionwords:propername_prefix(Left2, _)
  339	;
  340		functionwords:noun_prefix(Left2, _)
  341	;
  342		functionwords:verb_prefix(Left2)
  343	;
  344		functionwords:modif_prefix(Left2)
  345	),
  346	!.
  347
  348known_word(_, _, Prefix, ':') :-
  349	(
  350		functionwords:propername_prefix(Prefix, _)
  351	;
  352		functionwords:noun_prefix(Prefix, _)
  353	;
  354		functionwords:verb_prefix(Prefix)
  355	;
  356		functionwords:modif_prefix(Prefix)
  357	),
  358	!.
  359
  360known_word(_, _, WordForm, _) :-
  361	is_in_lexicon:is_in_lexicon(WordForm).
  362
  363
  364%----------------------------------------------------------------------------
  365% new_npid(-NPID:number) is det.
  366%
  367% new_npid delivers the next idenfifier (ID) for an NP.
  368% NP IDs are used for anaphora resolution recency calculations.
  369%----------------------------------------------------------------------------
  370
  371new_npid(NewNPID) :-
  372	b_getval(npid, OldNPID),
  373	NewNPID is OldNPID + 1,
  374	b_setval(npid, NewNPID)