%---------------------------------------------------------------

:- module(extract_feats,
	  [rec_results_to_feature_vectors/3,
	   feat_for_surface_words/3,
	   feat_for_nl_value/3,
	   words_covered_by_surface_pattern/3,
	   hand_coded_surface_pattern_match_in_string/3]
      ).

%---------------------------------------------------------------


:- use_module('$REGULUS/Alterf/Prolog/classifier_utilities').

:- use_module('$REGULUS/Prolog/regulus_eval').

:- use_module('$REGULUS/PrologLib/utilities').

:- use_module(library(system)).
:- use_module(library(lists)).

%---------------------------------------------------------------

/*

rec_results_to_feature_vectors(+RecResultsAlist, +FeatureExtractionSpecAlist, +FeatVectors)

Create feature vector file.

1. RecResultsAlist

List of one or more terms of the form Key-File, containing Prolog-readable rec results produced
by real or text batchrec. See batchrec.pl and home/speech/CambridgePrologLib/batchrec_tools.pl.

2. FeatureExtractionSpecAlist

List of one or more terms, in format defined in classifier_trainer.pl.

3. FeatVectors

File of feat vectors, in format

wavfile_with_words_confidence_and_feats(Id, WordsConfidenceFeatsList)

where Id is the wavfile ID, and WordsConfidenceFeatsList is a list of one or more items of the form

[Words, Confidence, Feats]

where Words is a list of words (Prolog atoms), Confidence is the confidence score (integer) and
Feats is a list of features.

------------------------------------------------------------------------------------------

feat_for_surface_words(-Feature, +FeatureExtractionSpec, +Words)

Find a feature based on surface words.

1. FeatureExtractionSpec is a term in format defined in classifier_trainer.pl.

2. Words is a list of surface words.

3. Feature is a feature of a type permitted by FeatureExtractionSpec occurring in Words.

------------------------------------------------------------------------------------------

feat_for_nl_value(-Feature, +FeatureExtractionSpec, +NLVal)

Find a feature based on NLValue (aka LF).

1. FeatureExtractionSpec is a term in format defined in classifier_trainer.pl.

2. NLVal is a Nuance NLValue encoded as a Prolog term

3. Feature is a feature of a type permitted by FeatureExtractionSpec occurring in NLVal.

------------------------------------------------------------------------------------------

words_covered_by_surface_pattern(+Words, +SemanticAtom, -PossibleWordsCovered)

Find the words on which a surface pattern match was based. There could be more than one such set of words.

1. Words is a list of words.

2. SemanticAtom is the atom for which the surface pattern match is being carried out. This could be a number, decimal number or similar.

3. PossibleWordsCovered is a (possibly empty) list of lists. Each list gives the words on which one possible match was based.

------------------------------------------------------------------------------------------

hand_coded_surface_pattern_match_in_string(+Pattern, +String, -WordsMatched)

Match a surface pattern to a string.

1. Pattern is a surface pattern.

2. String is a list of words.

3. WordsMatched is the list of words (a subset of String) that matches the pattern, if such a subset exists.

*/


% rec_results_to_feature_vectors(+RecResultsAlist, +FeatureExtractionSpecAlist, +FeatVectors)
%
% Opens the rec result files listed in RecResultsAlist for reading and the
% file FeatVectors (resolved with absolute_file_name/2) for writing, then
% lets rec_results_to_feature_vectors_stream/4 turn the records into
% feature vector terms. A progress message is printed before and after.
%
% Both the input streams and the output stream are released through
% call_cleanup/2, so they are closed even when processing fails or raises
% an exception. The processing goal is wrapped in once/1 so that a
% leftover choice point cannot postpone the cleanup.
rec_results_to_feature_vectors(RecResultsAlist, FeatureExtractionSpecAlist, FeatVectors) :-
	absolute_file_name(FeatVectors, FeatVectors1),
	format('~N-- Creating file of features ~w... ', [FeatVectors1]),

	open_alist(RecResultsAlist, read, SInAlist),
	call_cleanup(
	    (   open(FeatVectors1, write, SOut),
		call_cleanup(
		    once(rec_results_to_feature_vectors_stream(SInAlist, SOut, FeatureExtractionSpecAlist, 0)),
		    close(SOut)
		)
	    ),
	    close_alist(SInAlist)
	),

	format('done~n', []).

%---------------------------------------------------------------

% words_covered_by_surface_pattern(+Words, +SemanticAtom, -PossibleWordsCovered)
%
% Tags Words with the tagging grammar, brackets the result with the
% '*start*' and '*end*' markers, and collects every word list on which a
% surface pattern for SemanticAtom matches (tags expanded back to words).
% Duplicate word lists are removed; the result is [] if nothing matches.
words_covered_by_surface_pattern(Words, SemanticAtom, PossibleWordsCovered) :-
	apply_tagging_grammar(Words, TaggedWords),
	append(['*start*' | TaggedWords], ['*end*'], PaddedWords),
	findall(Covered,
		expanded_hand_coded_surface_pattern_match_in_string(PaddedWords, SemanticAtom, Covered),
		AllCovered),
	remove_duplicates(AllCovered, PossibleWordsCovered).

% expanded_hand_coded_surface_pattern_match_in_string(+Words, ?SemanticAtom, -ExpandedWordsMatched)
%
% Finds a surface pattern match for SemanticAtom in the (tagged) Words and
% rewrites the matched items with expand_using_tagging_grammar/2.
% Backtracks over alternative matches.
expanded_hand_coded_surface_pattern_match_in_string(Words, SemanticAtom, ExpandedWordsMatched) :-
	hand_coded_surface_pattern_match_in_string(Words, SemanticAtom, TaggedMatch, atom),
	expand_using_tagging_grammar(TaggedMatch, ExpandedWordsMatched).

%---------------------------------------------------------------

% rec_results_to_feature_vectors_stream(+SInAlist, +SOut, +FeatureExtractionSpecAlist, +Counter)
%
% Reads the next item alist from the input streams, increments the record
% counter (printing it after every 100th record as a progress indicator)
% and hands the item to rec_results_to_feature_vectors_stream1/5, which
% either stops at end of file or processes the item and loops back here.
rec_results_to_feature_vectors_stream(SInAlist, SOut, FeatureExtractionSpecAlist, Counter) :-
	read_alist(SInAlist, ItemAlist),
	RecordsSoFar is Counter + 1,
	(   RecordsSoFar mod 100 =\= 0 ->
	    true
	;   format('~d ', [RecordsSoFar]),
	    flush_output(user)
	),
	rec_results_to_feature_vectors_stream1(ItemAlist, SInAlist, SOut, FeatureExtractionSpecAlist, RecordsSoFar).

% rec_results_to_feature_vectors_stream1(+ItemAlist, +SInAlist, +SOut, +FeatureExtractionSpecAlist, +Counter)
%
% Loop body for rec_results_to_feature_vectors_stream/4.
% Stops when ItemAlist signals end of file; otherwise writes the feature
% vectors for this item and continues with the next record.
rec_results_to_feature_vectors_stream1(EofAlist, _, _, _, _) :-
	is_end_of_file_alist(EofAlist),
	!.
rec_results_to_feature_vectors_stream1(CurrentItems, SInAlist, SOut, FeatureExtractionSpecAlist, Counter) :-
	rec_results_item_alist_to_feature_vectors(CurrentItems, SOut, FeatureExtractionSpecAlist),
	!,
	rec_results_to_feature_vectors_stream(SInAlist, SOut, FeatureExtractionSpecAlist, Counter).

% rec_results_item_alist_to_feature_vectors(+ItemAlist, +SOut, +FeatureExtractionSpecAlist)
%
% Converts one item alist into a wavfile_with_words_confidence_and_feats/2
% term and writes it, quoted and terminated by a full stop, to SOut.
% If the conversion fails an error message is printed instead; the
% predicate itself still succeeds so that the enclosing loop carries on.
rec_results_item_alist_to_feature_vectors(ItemAlist, SOut, FeatureExtractionSpecAlist) :-
	rec_result_alist_to_id_words_confidence_and_features_list(ItemAlist, FeatureExtractionSpecAlist, Id, WordsConfidenceFeatsList),
	OutputTerm = wavfile_with_words_confidence_and_feats(Id, WordsConfidenceFeatsList),
	format(SOut, "~N~q.~n", [OutputTerm]),
	!.
rec_results_item_alist_to_feature_vectors(ItemAlist, SOut, FeatureExtractionSpecAlist) :-
	BadCall = rec_results_item_alist_to_feature_vectors(ItemAlist, SOut, FeatureExtractionSpecAlist),
	format('~N*** Error: bad call: ~w~n', [BadCall]).

% rec_result_alist_to_id_words_confidence_and_features_list(+ItemAlist, +FeatureExtractionSpecAlist, -Id, -WordsConfidenceFeatsList)
%
% Maps each Key-Item pair to a [Words, Confidence, Feats] triple, using
% the feature extraction spec stored under the same Key.
% Id comes from the first item only: the ids of the remaining items are
% extracted but not compared with it. For an empty alist Id stays unbound.
rec_result_alist_to_id_words_confidence_and_features_list([], _Specs, _Id, []) :-
	!.
rec_result_alist_to_id_words_confidence_and_features_list([Key-Item | MoreItems], SpecAlist, Id, [[Words, Confidence, Feats] | MoreTriples]) :-
	member(Key-Spec, SpecAlist),
	rec_result_to_id_words_confidence_and_features(Item, Spec, Id, Words, Confidence, Feats),
	!,
	rec_result_alist_to_id_words_confidence_and_features_list(MoreItems, SpecAlist, _IdOfLaterItem, MoreTriples).

%---------------------------------------------------------------

% rec_result_to_id_words_confidence_and_features(+Item, +FeatureExtractionSpec, -Id, -Words, -Confidence, -Feats)
%
% Item must be batchrec_item(Pairs), where Pairs contains wavfile=Id,
% words=Words and confidence=Confidence entries. Feats is the list of all
% features that feat_for_batchrec_pairs/3 yields for Pairs.
% Any other input is reported as a bad call and the predicate fails.
rec_result_to_id_words_confidence_and_features(batchrec_item(KeyVals), Spec, WavId, WordList, Score, AllFeats) :-
	member(wavfile=WavId, KeyVals),
	member(words=WordList, KeyVals),
	member(confidence=Score, KeyVals),
	findall(OneFeat,
		feat_for_batchrec_pairs(OneFeat, Spec, KeyVals),
		AllFeats),
	!.
rec_result_to_id_words_confidence_and_features(Item, Spec, WavId, WordList, Score, AllFeats) :-
	BadCall = rec_result_to_id_words_confidence_and_features(Item, Spec, WavId, WordList, Score, AllFeats),
	format('*** Error: bad call: ~w~n', [BadCall]),
	fail.

% feat_for_batchrec_pairs(-Feat, +FeatureExtractionSpec, +Pairs)
%
% Enumerates, on backtracking, the features of one batchrec record.
% The constant feature 'no_info' is produced for every utterance - it is
% wanted for normalisation. The remaining features are derived from the
% confidence=, words= and nl_value= entries of Pairs, in that order.
feat_for_batchrec_pairs(no_info, _Spec, _KeyVals).
feat_for_batchrec_pairs(Feat, Spec, KeyVals) :-
	member(confidence=Score, KeyVals),
	feat_for_confidence_score(Feat, Spec, Score).
feat_for_batchrec_pairs(Feat, Spec, KeyVals) :-
	member(words=WordList, KeyVals),
	feat_for_surface_words(Feat, Spec, WordList).
feat_for_batchrec_pairs(Feat, Spec, KeyVals) :-
	member(nl_value=NLValue, KeyVals),
	feat_for_nl_value(Feat, Spec, NLValue).

%---------------------------------------------------------------

% feat_for_confidence_score(-Feat, +FeatureExtractionSpec, +Confidence)
%
% Produces a confidence(From, To) feature for Confidence, but only when
% the spec lists 'confidence' among the requested feature types.
feat_for_confidence_score(Feat, Spec, Score) :-
	member(confidence, Spec),
	confidence_score_feat(Score, Feat).

% confidence_score_feat(+Confidence, -Feat)
%
% Feat is confidence(From, To), where From and To are the adjacent
% reference scores with From =< Confidence < To. Fails if Confidence lies
% outside the range covered by reference_confidence_scores/1.
confidence_score_feat(Score, confidence(Lower, Upper)) :-
	reference_confidence_scores(Boundaries),
	interval_in_list(Boundaries, Score, Lower, Upper).

% reference_confidence_scores(-Boundaries)
%
% Ascending list of boundary values used to bucket confidence scores.
% Each pair of adjacent values [From, To] defines the half-open interval
% From =< Confidence < To; the list therefore covers scores 0 to 100.
reference_confidence_scores([0, 10, 20, 25, 30, 35, 40, 43, 45, 47, 50, 55, 60, 65, 70, 80, 101]).

% interval_in_list(+Boundaries, +Confidence, -From, -To)
%
% From and To are the first pair of adjacent elements of Boundaries with
% From =< Confidence < To. Fails if no such pair exists.
interval_in_list([Lower, Upper | _], Score, Lower, Upper) :-
	Score >= Lower,
	Upper > Score,
	!.
interval_in_list([_ | LaterBoundaries], Score, Lower, Upper) :-
	interval_in_list(LaterBoundaries, Score, Lower, Upper).

%---------------------------------------------------------------

% feat_for_surface_words(-Feature, +FeatureExtractionSpec, +Words)
%
% Enumerates, on backtracking, the surface-word features of Words. Each
% clause is switched on by one keyword in FeatureExtractionSpec:
%
%   unigrams / bigrams / trigrams
%       n-grams over the plain words;
%   class_unigrams / class_bigrams / class_trigrams
%       n-grams over the words after apply_tagging_grammar/2;
%   hand_coded_surface_patterns
%       target atoms of matching surface patterns;
%   hand_coded_surface_pattern_rules
%       the matching surface pattern rules themselves.
%
% For bigrams, trigrams and surface patterns the word list is first
% bracketed with the markers '*start*' and '*end*'.

% Plain n-grams.
feat_for_surface_words(unigram(Word), Spec, Words) :-
	member(unigrams, Spec),
	member(Word, Words).
feat_for_surface_words(bigram(Bigram), Spec, Words) :-
	member(bigrams, Spec),
	append(['*start*' | Words], ['*end*'], Padded),
	bigram_in_list(Bigram, Padded).
feat_for_surface_words(trigram(Trigram), Spec, Words) :-
	member(trigrams, Spec),
	append(['*start*' | Words], ['*end*'], Padded),
	trigram_in_list(Trigram, Padded).

% N-grams over the tagged word list.
feat_for_surface_words(class_unigram(Item), Spec, RawWords) :-
	member(class_unigrams, Spec),
	apply_tagging_grammar(RawWords, Tagged),
	member(Item, Tagged).
feat_for_surface_words(class_bigram(Bigram), Spec, RawWords) :-
	member(class_bigrams, Spec),
	apply_tagging_grammar(RawWords, Tagged),
	append(['*start*' | Tagged], ['*end*'], Padded),
	bigram_in_list(Bigram, Padded).
feat_for_surface_words(class_trigram(Trigram), Spec, RawWords) :-
	member(class_trigrams, Spec),
	apply_tagging_grammar(RawWords, Tagged),
	append(['*start*' | Tagged], ['*end*'], Padded),
	trigram_in_list(Trigram, Padded).

% Hand-coded surface patterns: each distinct target atom is returned once.
feat_for_surface_words(hand_coded_surface_pattern_match(Atom), Spec, RawWords) :-
	member(hand_coded_surface_patterns, Spec),
	apply_tagging_grammar(RawWords, Tagged),
	append(['*start*' | Tagged], ['*end*'], Padded),
	findall(MatchedAtom,
		hand_coded_surface_pattern_match_in_string(Padded, MatchedAtom, _Covered, atom),
		MatchedAtoms),
	remove_duplicates(MatchedAtoms, DistinctAtoms),
	member(Atom, DistinctAtoms).
% Hand-coded surface patterns: each distinct matching rule is returned once.
feat_for_surface_words(hand_coded_surface_pattern(Rule), Spec, RawWords) :-
	member(hand_coded_surface_pattern_rules, Spec),
	apply_tagging_grammar(RawWords, Tagged),
	append(['*start*' | Tagged], ['*end*'], Padded),
	findall(MatchedRule,
		hand_coded_surface_pattern_match_in_string(Padded, MatchedRule, _Covered, rule),
		MatchedRules),
	remove_duplicates(MatchedRules, DistinctRules),
	member(Rule, DistinctRules).

% bigram_in_list(?Bigram, +List)
%
% Bigram is [A, B] for two adjacent elements A, B of List. Solutions are
% enumerated from left to right on backtracking.
bigram_in_list([First, Second], List) :-
	append(_Before, [First, Second | _After], List).

% trigram_in_list(?Trigram, +List)
%
% Trigram is [A, B, C] for three adjacent elements A, B, C of List.
% Solutions are enumerated from left to right on backtracking.
trigram_in_list([First, Second, Third], List) :-
	append(_Before, [First, Second, Third | _After], List).

%---------------------------------------------------------------

% apply_tagging_grammar(+Words, -TaggedWords)
%
% Rewrites Words from left to right. Wherever a tagging grammar rule
% matches a prefix of the remaining words, that prefix is replaced by the
% tag of the rule covering the longest substring; otherwise the word is
% copied unchanged. If the input cannot be processed (for example when it
% is not a list), an error message is printed and the predicate fails.
apply_tagging_grammar([], []) :-
	!.
apply_tagging_grammar(Words, [Tag | TaggedRest]) :-
	% A tag applies here: commit to the one consuming the most words.
	tag_spanning_longest_substring(Tag, Words, Remaining),
	!,
	apply_tagging_grammar(Remaining, TaggedRest).
apply_tagging_grammar([Word | Remaining], [Word | TaggedRest]) :-
	% No tag applies: keep the word itself.
	apply_tagging_grammar(Remaining, TaggedRest),
	!.
apply_tagging_grammar(Words, Tagged) :-
	format('~N*** Error: bad call: ~w~n', [apply_tagging_grammar(Words, Tagged)]),
	fail.

%---------------------------------------------------------------

% expand_using_tagging_grammar(+TaggedWords, -Words)
%
% Inverse direction of apply_tagging_grammar/2: atoms are copied
% unchanged, while every item that is a number or is not atomic is
% treated as a tag and replaced by a word sequence that
% user:tagging_grammar/3 associates with it. Only the first expansion is
% kept. On failure an error message is printed and the predicate fails.
expand_using_tagging_grammar([], []) :-
	!.
expand_using_tagging_grammar([Word | Items], [Word | Expanded]) :-
	atom(Word),
	expand_using_tagging_grammar(Items, Expanded),
	!.
expand_using_tagging_grammar([Tag | Items], Expanded) :-
	( number(Tag) ; \+ atomic(Tag) ),
	% Run the grammar "backwards": Expanded starts with the words for Tag
	% and continues with ExpandedTail.
	user:tagging_grammar(Tag, Expanded, ExpandedTail),
	expand_using_tagging_grammar(Items, ExpandedTail),
	!.
expand_using_tagging_grammar(Items, Expanded) :-
	format('~N*** Error: bad call: ~w~n', [expand_using_tagging_grammar(Items, Expanded)]),
	fail.

%---------------------------------------------------------------

% tag_spanning_longest_substring(-BestTag, +In, -BestOut)
%
% Among all tagging grammar rules that match a prefix of In, selects the
% one leaving the fewest words unconsumed, i.e. the one spanning the
% longest substring. BestOut is the list of words left over.
% Fails if no rule matches.
tag_spanning_longest_substring(BestTag, In, BestOut) :-
	findall(Remaining-[Tag, Out],
		tag_and_remaining_length(Tag, In, Out, Remaining),
		Candidates),
	Candidates = [_ | _],
	% keysort/2 sorts in ascending order, so the head of the sorted list
	% is the candidate with the shortest remainder.
	keysort(Candidates, [_ShortestRemaining-[BestTag, BestOut] | _Others]),
	!.

% tag_and_remaining_length(-Tag, +In, -Out, -LengthOut)
%
% Tag matches a prefix of In according to user:tagging_grammar/3, leaving
% the words Out, whose number is LengthOut.
tag_and_remaining_length(Tag, In, Leftover, LeftoverLength) :-
	user:tagging_grammar(Tag, In, Leftover),
	length(Leftover, LeftoverLength).

%---------------------------------------------------------------

% feat_for_nl_value(-Feature, +FeatureExtractionSpec, +NLVal)
%
% Converts NLVal to a logical form as directed by FeatureExtractionSpec
% and enumerates, on backtracking, the features feat_for_lf/3 finds in it.
feat_for_nl_value(Feat, Spec, NLValue) :-
	nl_val_to_lf_according_to_feature_extraction_spec(NLValue, Spec, LogicalForm),
	feat_for_lf(Feat, Spec, LogicalForm).

% nl_val_to_lf_according_to_feature_extraction_spec(+NLVal, +FeatureExtractionSpec, -LF)
%
% If the spec contains lf_postproc_pred=Pred, NLVal is evaluated with
% regulus_eval_text/3 (when the spec also contains nl_val_type=text) or
% with regulus_eval_speech/3 (otherwise), passing Pred along.
% If there is no such entry, or the evaluation fails, LF is NLVal itself.
% The final clause reports a bad call; it can only be reached when LF was
% already bound to something that does not unify with NLVal.
nl_val_to_lf_according_to_feature_extraction_spec(NLVal, Spec, LF) :-
	member(lf_postproc_pred=PostprocPred, Spec),
	(   member(nl_val_type=text, Spec) ->
	    regulus_eval_text(NLVal, LF, PostprocPred)
	;   regulus_eval_speech(NLVal, LF, PostprocPred)
	),
	!.
nl_val_to_lf_according_to_feature_extraction_spec(NLVal, _Spec, NLVal) :-
	!.
nl_val_to_lf_according_to_feature_extraction_spec(NLVal, Spec, LF) :-
	BadCall = nl_val_to_lf_according_to_feature_extraction_spec(NLVal, Spec, LF),
	format('~N*** Error: bad call: ~w~n', [BadCall]),
	fail.

% feat_for_lf(-Feature, +FeatureExtractionSpec, +LF)
%
% Enumerates, on backtracking, the features of the logical form LF. Each
% clause is switched on by one keyword in FeatureExtractionSpec:
%
%   post_processed_lf         the whole LF, wrapped in post_processed_lf/1;
%   sem_atoms                 each distinct result of sem_atom_in_lf/2;
%   sem_triples               each result of sem_triple_in_lf/2;
%   hand_coded_patterns       each distinct target atom of a matching
%                             hand-coded LF pattern;
%   hand_coded_pattern_rules  each distinct matching hand-coded LF rule.
feat_for_lf(post_processed_lf(LF), Spec, LF) :-
	member(post_processed_lf, Spec).

feat_for_lf(sem_atom(Atom), Spec, LF) :-
	member(sem_atoms, Spec),
	findall(FoundAtom, sem_atom_in_lf(LF, FoundAtom), FoundAtoms),
	remove_duplicates(FoundAtoms, DistinctAtoms),
	member(Atom, DistinctAtoms).

feat_for_lf(sem_triple(Triple), Spec, LF) :-
	member(sem_triples, Spec),
	sem_triple_in_lf(LF, Triple).

feat_for_lf(hand_coded_pattern_match(Atom), Spec, LF) :-
	member(hand_coded_patterns, Spec),
	findall(MatchedAtom,
		hand_coded_pattern_match_in_lf(LF, MatchedAtom, atom),
		MatchedAtoms),
	remove_duplicates(MatchedAtoms, DistinctAtoms),
	member(Atom, DistinctAtoms).

feat_for_lf(hand_coded_pattern(Rule), Spec, LF) :-
	member(hand_coded_pattern_rules, Spec),
	findall(MatchedRule,
		hand_coded_pattern_match_in_lf(LF, MatchedRule, rule),
		MatchedRules),
	remove_duplicates(MatchedRules, DistinctRules),
	member(Rule, DistinctRules).

%---------------------------------------------------------------

% hand_coded_pattern_match_in_lf(+LF, -Target, +AtomOrRule)
%
% Succeeds, on backtracking, once for every hand-coded pattern that
% matches LF itself or any of its subterms. Unbound subterms never match.
% Atomic terms are tested directly; compound terms are tested as a whole
% first and then searched argument by argument.
% AtomOrRule is passed on to hand_coded_pattern_match_on_term/3.
hand_coded_pattern_match_in_lf(Unbound, _Target, _AtomOrRule) :-
	var(Unbound),
	!,
	fail.
hand_coded_pattern_match_in_lf(Constant, Target, AtomOrRule) :-
	atomic(Constant),
	!,
	hand_coded_pattern_match_on_term(Constant, Target, AtomOrRule).
% Compound term: match on the term as a whole ...
hand_coded_pattern_match_in_lf(Term, Target, AtomOrRule) :-
	hand_coded_pattern_match_on_term(Term, Target, AtomOrRule).
% ... and on anything inside its arguments.
hand_coded_pattern_match_in_lf(Term, Target, AtomOrRule) :-
	functor(Term, _Name, Arity),
	hand_coded_pattern_match_in_lf_args(Arity, Term, Target, AtomOrRule).

% hand_coded_pattern_match_in_lf_args(+I, +Term, -Target, +AtomOrRule)
%
% Looks for pattern matches inside arguments I, I-1, ..., 1 of Term, in
% that order, delegating each argument to hand_coded_pattern_match_in_lf/3.
hand_coded_pattern_match_in_lf_args(Index, Term, Target, AtomOrRule) :-
	Index > 0,
	arg(Index, Term, SubTerm),
	hand_coded_pattern_match_in_lf(SubTerm, Target, AtomOrRule).
hand_coded_pattern_match_in_lf_args(Index, Term, Target, AtomOrRule) :-
	Index > 1,
	Previous is Index - 1,
	hand_coded_pattern_match_in_lf_args(Previous, Term, Target, AtomOrRule).

% hand_coded_pattern_match_on_term(+Term, -Target, +AtomOrRule)
%
% Matches Term against the patterns declared as user:alterf_pattern/3.
% Fails quietly if no such predicate is defined.
%
% AtomOrRule = atom: Target is the target atom of a pattern unifying
%                    with Term.
% AtomOrRule = rule: Target is the whole alterf_pattern/3 rule whose
%                    pattern unifies with Term.
hand_coded_pattern_match_on_term(Term, TargetAtom, atom) :-
	current_predicate(user:alterf_pattern/3),
	user:alterf_pattern(Term, TargetAtom, _Example).
hand_coded_pattern_match_on_term(Term, StoredRule, rule) :-
	current_predicate(user:alterf_pattern/3),
	user:alterf_pattern(RulePattern, RuleTarget, Example),
	StoredRule = alterf_pattern(RulePattern, RuleTarget, Example),
	% The match is carried out on a copy, so that unifying with Term does
	% not instantiate the rule that is returned as the feature.
	copy_term(StoredRule, RuleCopy),
	% make_ground/1 is defined elsewhere; presumably it instantiates the
	% variables left in the returned rule - TODO confirm.
	make_ground(StoredRule),
	RuleCopy = alterf_pattern(Term, _CopiedTarget, Example).

%---------------------------------------------------------------

% hand_coded_surface_pattern_match_in_string(+String, -Target, -WordsMatched, +AtomOrRule)
%
% Matches the word list String against the surface patterns declared as
% user:alterf_surface_pattern(Pattern, TargetAtom, Exceptions), honouring
% each rule's exceptions. Fails quietly if no such predicate is defined.
%
% AtomOrRule = atom: Target is the target atom of a matching rule.
% AtomOrRule = rule: Target is the whole alterf_surface_pattern/3 rule.
%
% WordsMatched is the list of words the pattern matched.
hand_coded_surface_pattern_match_in_string(String, TargetAtom, WordsMatched, atom) :-
	current_predicate(user:alterf_surface_pattern/3),
	user:alterf_surface_pattern(Pattern, TargetAtom, Exceptions),
	hand_coded_surface_pattern_match_in_string_checking_exceptions(Pattern, String, WordsMatched, Exceptions).
hand_coded_surface_pattern_match_in_string(String, StoredRule, WordsMatched, rule) :-
	current_predicate(user:alterf_surface_pattern/3),
	user:alterf_surface_pattern(RulePattern, RuleTarget, Exceptions),
	StoredRule = alterf_surface_pattern(RulePattern, RuleTarget, Exceptions),
	% The match is carried out on a copy, so that matching against String
	% does not instantiate the rule that is returned as the feature.
	copy_term(StoredRule, RuleCopy),
	% make_ground/1 is defined elsewhere; presumably it instantiates the
	% variables left in the returned rule - TODO confirm.
	make_ground(StoredRule),
	RuleCopy = alterf_surface_pattern(PatternCopy, _CopiedTarget, Exceptions),
	hand_coded_surface_pattern_match_in_string_checking_exceptions(PatternCopy, String, WordsMatched, Exceptions).

% hand_coded_surface_pattern_match_in_string_checking_exceptions(+Pattern, +String, -WordsMatched, +Exceptions)
%
% Pattern matches somewhere in String, covering WordsMatched, and none of
% the exception patterns given in Exceptions matches String.
hand_coded_surface_pattern_match_in_string_checking_exceptions(Pattern, Words, Covered, Exceptions) :-
	hand_coded_surface_pattern_match_in_string(Pattern, Words, Covered),
	check_exceptions(Exceptions, Pattern, Words).

% hand_coded_surface_pattern_match_in_string(+Pattern, +String, -WordsMatched)
%
% Pattern matches a prefix of some suffix of the word list String, i.e.
% the match may start at any position. Start positions are tried from
% left to right. WordsMatched is the list of words the pattern matched.
hand_coded_surface_pattern_match_in_string(Pattern, Words, Covered) :-
	% Try to match starting at the current position.
	hand_coded_surface_pattern_match_in_string2(Pattern, Words-_Unconsumed, Covered-[]).
hand_coded_surface_pattern_match_in_string(Pattern, [_Skipped | LaterWords], Covered) :-
	% Otherwise (or on backtracking) move the start one word to the right.
	hand_coded_surface_pattern_match_in_string(Pattern, LaterWords, Covered).

% hand_coded_surface_pattern_match_in_string2(+Pattern, +StringIn-StringOut, WordsMatchedIn-WordsMatchedOut)
%
% Matches Pattern against a prefix of the word list StringIn. StringOut is
% the part of StringIn that was not consumed. The matched words are
% collected in the difference list WordsMatchedIn-WordsMatchedOut.
%
% Pattern elements:
%   [E1, E2, ...]   E1, E2, ... match consecutively;
%   '...'           inside a list: skips zero or more words
%                   (the skipped words are not recorded);
%   E1/E2           E1 or E2 matches;
%   not(E)          E does not match at this position; consumes nothing;
%   not_word(E)     consumes one word that E does not match
%                   (the word is not recorded);
%   number(N)       consumes one word N that is a number;
%   anything else   consumes one word unifying with the element.

% The empty pattern consumes nothing. StringOut must therefore be bound to
% StringIn here: otherwise the remainder stays unbound after a nested list
% pattern, and the elements following it would unify with that unbound
% remainder and so "match" words that are not in the input at all.
hand_coded_surface_pattern_match_in_string2([], StringIn-StringOut, WordsMatched-WordsMatched) :-
	!,
	StringOut = StringIn.
hand_coded_surface_pattern_match_in_string2(['...' | R], [F | R1]-StringOut, WordsMatched) :-
	% Either stop skipping here, or skip one more word and try again.
	(   hand_coded_surface_pattern_match_in_string2(R, [F | R1]-StringOut, WordsMatched) ;
	    hand_coded_surface_pattern_match_in_string2(['...' | R], R1-StringOut, WordsMatched)
	).
hand_coded_surface_pattern_match_in_string2([F | R], StringIn-StringOut, WordsMatchedIn-WordsMatchedOut) :-
	hand_coded_surface_pattern_match_in_string2(F, StringIn-StringNext, WordsMatchedIn-WordsMatchedNext),
	hand_coded_surface_pattern_match_in_string2(R, StringNext-StringOut, WordsMatchedNext-WordsMatchedOut).
hand_coded_surface_pattern_match_in_string2(Elt1/Elt2, String, WordsMatched) :-
	(   hand_coded_surface_pattern_match_in_string2(Elt1, String, WordsMatched) ;
	    hand_coded_surface_pattern_match_in_string2(Elt2, String, WordsMatched)
	).
% Don't count a match if it's in the scope of a 'not'.
hand_coded_surface_pattern_match_in_string2(not(Elt), StringIn-StringIn, WordsMatched-WordsMatched) :-
	\+ hand_coded_surface_pattern_match_in_string2(Elt, StringIn-_StringOut, _DiscardedWordsMatched-[]).
hand_coded_surface_pattern_match_in_string2(not_word(Elt), [F | R]-R, WordsMatchedOut-WordsMatchedOut) :-
	% Elt is tested against the single word F in isolation.
	\+ hand_coded_surface_pattern_match_in_string2(Elt, [F]-[], _WordsMatched).
hand_coded_surface_pattern_match_in_string2(number(N), [N | R]-R, [N | WordsMatchedOut]-WordsMatchedOut) :-
	number(N).
hand_coded_surface_pattern_match_in_string2(Atom, [Atom | R]-R, [Atom | WordsMatchedOut]-WordsMatchedOut).

%---------------------------------------------------------------

% check_exceptions(+Exceptions, +Pattern, +String)
%
% Succeeds if the exceptions of a surface pattern rule do not block a
% match on String. Exceptions of the form unless(List) are checked
% pattern by pattern; any other value means "no exceptions".
check_exceptions(unless(ExceptionPatterns), Pattern, Words) :-
	!,
	check_exceptions1(ExceptionPatterns, Pattern, Words).
check_exceptions(_NoExceptions, _Pattern, _Words).

% check_exceptions1(+ExceptionPatterns, +Pattern, +String)
%
% Succeeds if none of the ExceptionPatterns matches anywhere in String.
% Pattern is carried along but not used.
check_exceptions1([], _Pattern, _Words).
check_exceptions1([Exception | OtherExceptions], Pattern, Words) :-
	\+ hand_coded_surface_pattern_match_in_string(Exception, Words, _Covered),
	!,
	check_exceptions1(OtherExceptions, Pattern, Words).