:- ensure_loaded('$REGULUS/PrologLib/compatibility').

%---------------------------------------------------------------

:- module(orthography_process_text,
          [init_orthography_process_text/1,
           orthography_process_string/2,
           orthography_process_file/3,

           test_orthography_process_text/1
          ]
         ).

%---------------------------------------------------------------

:- use_module('$REGULUS/Prolog/generate').
:- use_module('$REGULUS/Prolog/regulus_utilities').
:- use_module('$REGULUS/PrologLib/utilities').

:- use_module(library(lists)).

%---------------------------------------------------------------

% test_orthography_process_text(+TestId)
%
% Canned smoke tests, selected by TestId:
%   0        - run a single hard-coded sentence through the French rules.
%   ez_small - run the small Europarl sample file through the French rules.

test_orthography_process_text(0) :-
    % Alternative sample input: "tu te es inquiétés"
    test_orthography_process_sent("l avocat"),
    !.
test_orthography_process_text(ez_small) :-
    InFile = '$ACCEPT/MT/Europarl/Generated/europarl_ez_transformed_small.txt',
    RulesFile = '$REGULUS/PrologLib/CorpusTools/french_orthography.pl',
    OutFile = '$ACCEPT/MT/Europarl/Generated/europarl_ez_orthography_processed_small.txt',
    orthography_process_file(InFile, RulesFile, OutFile).

%---------------------------------------------------------------

% test_orthography_process_sent(+Sent)
%
% Loads the French orthography rules, then prints Sent before and after
% orthography processing. Both values are printed with ~s.

test_orthography_process_sent(Input) :-
    init_orthography_process_text('$REGULUS/PrologLib/CorpusTools/french_orthography.pl'),
    format('~NIn   : ~s~n', [Input]),
    orthography_process_string(Input, Output),
    format('~NTrans: ~s~n', [Output]).

%---------------------------------------------------------------

% init_orthography_process_text(+File)
%
% Compiles the orthography rule file File into the temporary file
% 'compiled_orthography.pl' and loads the result with safe_compile/2
% under the name orthography_rules (the same name that
% orthography_process_string/2 passes to the rule applier).
% Progress messages are printed after each of the two steps.

init_orthography_process_text(RulesFile) :-
    tmp_regulus_file('compiled_orthography.pl', TmpCompiled),
    safe_absolute_file_name(RulesFile, AbsRulesFile),
    safe_absolute_file_name(TmpCompiled, AbsTmpCompiled),
    % Step 1: translate the rule file into its compiled form.
    compile_orthography_file_or_files(AbsRulesFile, AbsTmpCompiled),
    format('~N--- Compiled orthography rule file ~w~n', [AbsRulesFile]),
    % Step 2: load the compiled rules.
    safe_compile(orthography_rules, AbsTmpCompiled),
    format('~N--- Loaded compiled orthography rule file ~w~n', [AbsTmpCompiled]),
    !.
%---------------------------------------------------------------

% orthography_process_file(+InFile, +OrthographyRulesFile, +OutFile)
%
% Loads the rules in OrthographyRulesFile, reads InFile as a list of
% atoms (one per line, going by the progress message), applies
% orthography_process_string/2 to each one and writes the results to
% OutFile. Progress and summary messages go to the current output.

orthography_process_file(InFile, OrthographyRulesFile, OutFile) :-
    absolute_file_name(InFile, AbsInFile),
    absolute_file_name(OutFile, AbsOutFile),

    init_orthography_process_text(OrthographyRulesFile),

    read_unicode_file_to_atom_list(AbsInFile, InList),
    length(InList, NIn),
    format('~N--- Read file (~d lines) ~w~n', [NIn, AbsInFile]),

    % 0 = items processed so far; 0-NOut = difference pair counting
    % how many items were altered.
    transform_sents_in_list(InList, OutList, 0, 0-NOut),

    write_out_transformed_list_to_file(OutList, AbsOutFile, NIn, NOut),
    !.

% write_out_transformed_list_to_file(+List, +File, +NIn, +NOut)
%
% Writes the elements of List to File (UTF-8 with encoding signature),
% then reports NOut, the number of altered lines, as a count and as a
% percentage of NIn, the number of lines read.

write_out_transformed_list_to_file(List, File, NIn, NOut) :-
    open(File, write, S, [encoding('UTF-8'), encoding_signature(true)]),
    write_out_transformed_list_to_stream(List, S),
    close(S),
    % An empty input file gives NIn = 0; report 0.0% instead of
    % dividing by zero after the output has already been written.
    (   NIn =:= 0 ->
        PC = 0.0
    ;   otherwise ->
        PC is (100.0 * NOut) / NIn
    ),
    format('~N--- Written file (~d lines altered = ~1f%) ~w~n', [NOut, PC, File]),
    !.

% write_out_transformed_list_to_stream(+List, +Stream)
%
% Writes each element of List to Stream with ~w, followed by a newline.

write_out_transformed_list_to_stream([], _S).
write_out_transformed_list_to_stream([F | R], S) :-
    format(S, '~N~w~n', [F]),
    !,
    write_out_transformed_list_to_stream(R, S).

% transform_sents_in_list(+In, -Out, +I, +CIn-COut)
%
% Applies orthography_process_string/2 to every element of In, giving Out.
% I is the number of elements already processed. CIn is the number of
% altered elements so far and COut the final total.
% Prints '+' for each altered element and '-' for each unchanged one,
% plus the running count after every 100 elements.

transform_sents_in_list([], [], _I, CIn-CIn).
transform_sents_in_list([F | R], Out, I, CIn-COut) :-
    orthography_process_string(F, F1),
    Out = [F1 | R1],
    (   F \== F1 ->
        CNext is CIn + 1,
        format('+', [])
        %format('~N~w~n', [F1])
    ;   otherwise ->
        CNext = CIn,
        format('-', [])
    ),
    I1 is I + 1,
    (   0 is I1 mod 100 ->
        format(' (~d)~n', [I1])
    ;   otherwise ->
        true
    ),
    flush_output(user),
    !,
    transform_sents_in_list(R, R1, I1, CNext-COut).

%---------------------------------------------------------------

% orthography_process_string(+AtomIn, -AtomOut)
%
% Converts AtomIn to a code list, applies the loaded orthography_rules
% with fix_orthography_simple_on_string/3, passes the result through
% initial_uppercase_string/2 (presumably capitalising the first
% character - defined elsewhere, confirm) and converts back to an atom.
% init_orthography_process_text/1 must have been called first so that
% the rules are loaded.

orthography_process_string(AtomIn, AtomOut) :-
    atom_codes(AtomIn, Str),
    fix_orthography_simple_on_string(Str, orthography_rules, Str1),
    initial_uppercase_string(Str1, Str2),
    atom_codes(AtomOut, Str2),
    !.