ntokkie.pl

    2% the new tokkie.pl, by Johan Bos
    3
    4/* ========================================================================
    5   File Search Paths
    6======================================================================== */
    7
    8:- prolog_load_context(file,File),
    9   absolute_file_name('..',X,[relative_to(File),file_type(directory)]),
   10   asserta(user:file_search_path(candc,X)).   11
   12user:file_search_path(semlib,     candc(lib)).
   13user:file_search_path(boxer,      candc(boxer)).
   14
   15:- set_prolog_flag(double_quotes,codes).   16
   17/* ========================================================================
   18   Dynamic Predicates
   19======================================================================== */
   20
   21:- dynamic split/7, title/1.   22
   23
   24/* ========================================================================
   25   Load other libraries
   26======================================================================== */
   27
   28:- use_module(library(lists),[member/2,append/3,reverse/2]).   29:- use_module(library(readutil),[read_stream_to_codes/2]).   30:- use_module(semlib(abbreviations),[iAbb/2,tAbb/2]).   31:- use_module(semlib(errors),[error/2,warning/2]).   32:- use_module(semlib(options),[option/2,parseOptions/2,setOption/3,
   33                               showOptions/1,setDefaultOptions/1]).   34
   35
   36/* ========================================================================
   37   Main
   38======================================================================== */
   39
   40tokkie:-
   41   option(Option,do), 
   42   member(Option,['--help']), !, 
   43   help.
   44
   45tokkie:-
   46   openInput(InStream),
   47   openOutput(OutStream), !,
   48   read_stream_to_codes(InStream,Codes),
   49   close(InStream),
   50   initTokkie,
   51   readLines(Codes,0,1,OutStream,Tokens),
   52   outputIOB(Codes,Tokens,OutStream),
   53   close(OutStream).
   54
   55tokkie:-
   56   setOption(tokkie,'--help',do), !,
   57   help.
   58
   59
   60/* ----------------------------------------------------------------------
   61   Read lines
   62---------------------------------------------------------------------- */
   63
   64readLines(Codes1,I1,S1,Stream,[Tokens|L]):-
   65   begSent(Codes1,I1,Codes2,I2), !,       % determine begin of a new sentence
   66   endSent(Codes2,I2,Codes3,I3,Rest,[]),  % determine end of this sentence  
   67%  format(Stream,'sen(~p,~p,~s).~n',[I2,I3,Codes3]),
   68%  write(Codes3),nl,
   69   tokenise(Codes3,I2,I2,T-T,Tokens),     % split sentence into tokens
   70   outputTokens(Tokens,S1,Stream),
   71   S2 is S1 + 1,                          % increase sentence counter
   72   readLines(Rest,I3,S2,Stream,L).        % process remaining of document
   73
   74readLines(_,_,_,_,[]).
   75
   76
   77/* ----------------------------------------------------------------------
   78   Determine beginning of sentence
   79---------------------------------------------------------------------- */
   80
   81begSent([Sep|C1],I1,C2,I3):- 
   82   sep(Sep), !,               % skip space, tab or newline
   83   I2 is I1 + 1,
   84   begSent(C1,I2,C2,I3).
   85
   86begSent([C|L],I,[C|L],I).
   87
   88
   89/* ----------------------------------------------------------------------
   90   Determine end of sentence
   91
   92   endSent(+CodesI,             % Input string
   93           +CurrentPosition,    % Current character position
   94           +CodesO,             % Output string (until sentence boundary)
   95           +BoundaryPosition,   % Character position of boundary
   96           +CodesR,             % Rest string
   97           +CodesLast)          % Last token
   98
   99---------------------------------------------------------------------- */
  100
  101endSent([],I,[],I,[],_):- !.
  102
  103% Case 1: A full stop after a space 
  104%         --> sentence boundary.
  105endSent([46|Rest],I1,[46],I2,Rest,[]):- !, 
  106   I2 is I1 + 1.
  107
  108% Case 2: full stop before a quote followed by a space
  109%         --> sentence boundary
  110endSent([46,Q1,Q2,X|Rest],I1,[46,Q1,Q2],I2,[X|Rest],_):- 
  111   \+ alphanum(X), quote(Q1), quote(Q2), !, I2 is I1 + 3.
  112
  113endSent([46,Q,X|Rest],I1,[46,Q],I2,[X|Rest],_):- 
  114   \+ alphanum(X), quote(Q), !, I2 is I1 + 2.
  115
  116% Case 3: full stop, but no sentence boundary
  117% 
  118endSent([C|C1],I1,[C|C2],I3,Rest,Last):- 
  119   noSentenceBoundary([C],C1,Last), !,
  120   I2 is I1 + 1,
  121   endSent(C1,I2,C2,I3,Rest,[C|Last]).
  122
  123% Case 4: A full stop/question/exclemation mark after a non-abbreviation 
  124%         --> sentence boundary
  125endSent([End|Rest],I1,[End],I2,Rest,_):- 
  126   member(End,[46,63,33]), !, 
  127   I2 is I1 + 1.
  128
  129endSent([46|Rest],I1,[46],I2,Rest,_):- !, 
  130   I2 is I1 + 1.
  131
  132endSent([C|C1],I1,[C|C2],I3,Rest,Last):-
  133   alphanum(C), !,
  134   I2 is I1 + 1,
  135   endSent(C1,I2,C2,I3,Rest,[C|Last]).
  136
  137endSent([C|C1],I1,[C|C2],I3,Rest,_):-
  138   I2 is I1 + 1,
  139   endSent(C1,I2,C2,I3,Rest,[]).
  140
  141
  142/* ----------------------------------------------------------------------
  143   Cases describing NO sentence boundaries
  144
  145   noSentenceBoundary(Char,     % Character that could signal boundary
  146                      Next,     % Codes following
  147                      Last)     % Last token
  148
  149---------------------------------------------------------------------- */
  150% Case 1: full stop after uppercase one-character token (i.e. initial)
  151noSentenceBoundary(".",_,Last):- Last = [Upper], upper(Upper).
  152% Case 2: full stop after a title 
  153noSentenceBoundary(".",_,Last):- title(Last).
  154% Case 2: full stop after an abbrev 
  155noSentenceBoundary(".",_,Last):- member(46,Last).
  156% Case 3: full stop before number
  157noSentenceBoundary(".",[N|_],_):- num(N).
  158
  159
  160/* ----------------------------------------------------------------------
  161   Split Line into Tokens
  162---------------------------------------------------------------------- */
  163
  164% Nothing left to do, no tokens in queue
  165%
  166tokenise([],_,_,Sofar-[],[]):- Sofar=[], !.
  167
  168% Nothing left to do, still a token present (input empty): store last token 
  169%
  170tokenise([],CurrentPos,StartPos,Sofar-[],[tok(StartPos,CurrentPos,Sofar)]):- !.
  171
  172% Separator follows separator
  173%
  174tokenise([Sep|Codes],CurrentPos,_,T1-T2,Tokens):-
  175   sep(Sep), T2=[], T1=[], !,
  176   Pos is CurrentPos + 1, 
  177   tokenise(Codes,Pos,Pos,T-T,Tokens).
  178
  179% Separator follows token
  180%
  181tokenise([Sep|Codes],CurrentPos,StartPos,Sofar-Tail,[Token|Tokens]):-
  182   sep(Sep), !, Tail = [],
  183   Token = tok(StartPos,CurrentPos,Sofar), 
  184   Pos is CurrentPos + 1, 
  185   tokenise(Codes,Pos,Pos,T-T,Tokens).
  186
  187% Last character is a split, nothing in the queue: store last character
  188%
  189tokenise(Input,CurrentPos,_,Sofar-[],[Token|Tokens]):- 
  190   final(Input,Head,Rest,Len), Sofar = [], !,
  191   FinalPos is CurrentPos + Len,
  192   Token = tok(CurrentPos,FinalPos,Head),
  193   tokenise(Rest,FinalPos,FinalPos,T-T,Tokens).
  194
  195% Last character is a split, store item in the queue and last character
  196%
  197tokenise(Input,CurrentPos,StartPos,Sofar-[],[Token1,Token2|Tokens]):- 
  198   final(Input,Head,Rest,Len), !,
  199   FinalPos is CurrentPos + Len,
  200   Token1 = tok(StartPos,CurrentPos,Sofar),
  201   Token2 = tok(CurrentPos,FinalPos,Head),
  202   tokenise(Rest,FinalPos,FinalPos,T-T,Tokens).
  203
  204% Do not perform a split
  205%
  206tokenise(Input,CurrentPos,StartPos,OldSofar,Tokens):-
  207   dontsplit(Input,Rest,Diff,OldSofar,NewSofar), !,
  208   Pos is CurrentPos + Diff, 
  209   tokenise(Rest,Pos,StartPos,NewSofar,Tokens).
  210
  211
  212% Perform a token split operation
  213%
  214tokenise(Input,CurrentPos,StartPos,Sofar-Tail,[Token|Tokens]):-
  215   trysplit(Input,Left,Right,Rest,LenLeft,LenRight), !,
  216%  format('Input: ~s~n',[Input]),
  217%  format('Left: ~s~n',[Left]),
  218%  format('Right: ~s~n',[Right]),
  219%  format('Rest: ~s~n',[Rest]),
  220   Pos is CurrentPos + LenLeft,
  221   NewPos is Pos + LenRight,
  222   Tail = Left,
  223   Token = tok(StartPos,Pos,Sofar),    
  224   append(Right,NewTail,New),
  225   tokenise(Rest,NewPos,Pos,New-NewTail,Tokens).
  226
  227% Do nothing but collect new token
  228%
  229tokenise([X|Codes],CurrentPos,StartPos,Sofar-Tail,Tokens):-
  230   Pos is CurrentPos + 1, 
  231   Tail = [X|NewTail],
  232   tokenise(Codes,Pos,StartPos,Sofar-NewTail,Tokens).
  233
  234
  235/* ----------------------------------------------------------------------
  236   Output Tokens
  237---------------------------------------------------------------------- */
  238
  239outputTokens(Tokens,S,Stream):-
  240   option('--mode',poor), !,
  241   printTokens(Tokens,S,1,Stream).
  242
  243outputTokens(Tokens,S,Stream):-
  244   option('--mode',rich), !,
  245   printTokens(Tokens,S,1,Stream).
  246
  247outputTokens(_,_,_).
  248
  249
  250/* ----------------------------------------------------------------------
  251   Wrapper IOB format
  252---------------------------------------------------------------------- */
  253
  254outputIOB(Codes,Tokens,Stream):-
  255   option('--mode',iob), !,
  256   printIOB(Codes,0,Tokens,Stream).
  257
  258outputIOB(_,_,_).
  259
  260
  261/* ----------------------------------------------------------------------
  262   Output IOB format
  263---------------------------------------------------------------------- */
  264
  265printIOB([],_,_,_).
  266
  267printIOB([X|L],N1,TokenSet,Stream):-
  268   member([tok(N1,_,Tok)|_],TokenSet), !, Tag = 'S',
  269   tupleIOB(N1,X,Tag,Tok,Stream),
  270   N2 is N1 + 1,
  271   printIOB(L,N2,TokenSet,Stream).
  272
  273printIOB([X|L],N1,TokenSet,Stream):-
  274   member(Tokens,TokenSet),
  275   member(tok(N1,_,Tok),Tokens), !, Tag = 'T',
  276   tupleIOB(N1,X,Tag,Tok,Stream),
  277   N2 is N1 + 1,
  278   printIOB(L,N2,TokenSet,Stream).
  279
  280printIOB([X|L],N1,TokenSet,Stream):-
  281   member(Tokens,TokenSet),
  282   member(tok(Start,End,_),Tokens), N1 > Start, N1 < End, !, Tag = 'I',
  283   tupleIOB(N1,X,Tag,[],Stream),
  284   N2 is N1 + 1,
  285   printIOB(L,N2,TokenSet,Stream).
  286
  287printIOB([X|L],N1,TokenSet,Stream):-
  288   Tag = 'O',
  289   tupleIOB(N1,X,Tag,[],Stream),
  290   N2 is N1 + 1,
  291   printIOB(L,N2,TokenSet,Stream).
  292
  293
  294/* ----------------------------------------------------------------------
  295   Tuple IOB format
  296---------------------------------------------------------------------- */
  297
  298tupleIOB(_,X,Tag,_,Stream):- 
  299   option('--format',txt), !,
  300   format(Stream,'~p ~p~n',[X,Tag]).
  301
  302tupleIOB(N,X,Tag,Tok,Stream):- 
  303   option('--format',prolog), !,
  304   format(Stream,'tok(~p,\'~p\'). % ~p ~s~n',[X,Tag,N,Tok]).
  305
  306
  307/* ----------------------------------------------------------------------
  308   Print Tokens
  309---------------------------------------------------------------------- */
  310
  311printTokens([],_,_,_). 
  312
  313printTokens([tok(_,_,Tok)],_,_,Stream):- 
  314   option('--mode',poor), !,
  315   format(Stream,'~s~n',[Tok]). 
  316
  317printTokens([tok(I,J,Tok)|L],S,T1,Stream):- 
  318   option('--format',prolog),
  319   option('--mode',rich), !,
  320   Index is S*1000+T1,
  321   format(Stream,'tok(~p, ~p, ~p, ~s).~n',[I,J,Index,Tok]), 
  322   T2 is T1+1,
  323   printTokens(L,S,T2,Stream).
  324
  325printTokens([tok(I,J,Tok)|L],S,T1,Stream):- 
  326   option('--format',txt),
  327   option('--mode',rich), !,
  328   Index is S*1000+T1,
  329   format(Stream,'~p ~p ~p ~s~n',[I,J,Index,Tok]), 
  330   T2 is T1+1,
  331   printTokens(L,S,T2,Stream).
  332
  333printTokens([tok(_,_,Tok)|L],S,T,Stream):- 
  334   option('--mode',poor), !,
  335   format(Stream,'~s ',[Tok]), 
  336   printTokens(L,S,T,Stream).
  337
  338
  339/* ----------------------------------------------------------------------
  340   Type checking
  341---------------------------------------------------------------------- */
  342
  343sep(10).    % new line
  344sep(13).    % new line
  345sep(32).    % space
  346sep(9).     % tab
  347sep(160).   % nbsp (non-breaking space)
  348sep(8194).  % en space
  349sep(8195).  % em space
  350
  351alphanum(X):- alpha(X), !.
  352alphanum(X):- num(X), !.
  353
  354alpha(62):- !.                         %%% '>' (end of markup)
  355alpha(X):- upper(X), !.
  356alpha(X):- lower(X), !.
  357
  358upper(X):- number(X), X > 64, X < 91, !.
  359upper(X):- var(X), member(X,"ABCDEFGHIJKLMNOPQRSTUVWXYZ").
  360
  361lower(X):- number(X), X > 96, X < 123, !.
  362lower(X):- var(X), member(X,"abcdefghijklmnopqrstuvwxyz").
  363
  364num(X):- number(X), X > 47, X < 58, !.
  365num(X):- var(X), member(X,"0123456789").
  366
  367
  368/* ----------------------------------------------------------------------
  369   Rules for splitting tokens
  370   split(+Left,+ConditionsOnLeft,+Right,+ConditionsOnRight,+Context)
  371---------------------------------------------------------------------- */
  372
  373split(`can`,[], "not",[], []).
  374split([_],[], "n't",[], []).
  375split([_],[], "'ll",[], []).
  376split([_],[], "'ve",[], []).
  377split([_],[], "'re",[], []).
  378
  379split([_],[], "'m",[], []).
  380split([_],[], "'d",[], []).
  381split([_],[], "'s",[], []).
  382
  383split([N],[num(N)],   [], [], "%").
  384split("%",[],         ",",[],[]).
  385split(")",[],         ",",[],[]).
  386
  387split([N],[num(N)],   ",",[], [32]).
  388split([N],[num(N)],   ",",[], [10]).
  389split([A],[alpha(A)], [], [], ",").
  390split([_],[],         ";",[], []).
  391split([_],[],         ":",[], []).
  392split([_],[],         [],[], ")").
  393%split([_],[],         ")",[], []).
  394split([_],[],         "]",[], []).
  395
  396split("$",[],   [N],[num(N)], []).     % dollar
  397split([163],[], [N],[num(N)], []).     % pound
  398split([165],[], [N],[num(N)], []).     % yen
  399split("(",[],   [X],[alphanum(X)], []).
  400split("[",[],   [X],[alphanum(X)], []).
  401
  402split([_],[],         [Q],[quote(Q)], []).
  403split([Q],[quote(Q)], [X],[alphanum(X)], []).
  404
  405
  406/* ----------------------------------------------------------------------
  407   Exceptions (do not split)
  408---------------------------------------------------------------------- */
  409
  410dontsplit(Input,Rest,N,Old-OldTail,Old-NewTail):- 
  411   nosplit(Left,N),
  412   append(Left,Rest,Input), !,
  413   append(Left,NewTail,OldTail).
  414
  415nosplit("hi'it",5).
  416nosplit("e.g.",4).
  417nosplit([79,Q,U],3):- rsq(Q), upper(U).   % Irish names
  418
  419
  420/* ----------------------------------------------------------------------
  421   Initialisation
  422---------------------------------------------------------------------- */
  423
  424initTokkie:-  
  425   initTitles,
  426   initSplitRules.
  427
  428initTitles:-
  429   option('--language',Language), !,
  430   findall(Title,
  431           ( tAbb(Language,Title),
  432             reverse(Title,Reversed),
  433             assertz(title(Reversed)) ),
  434           _).
  435          
  436initSplitRules:-
  437   findall(Ri,
  438          ( split(Le,CondLe,Ri,CondRi,Context),
  439            length(Le,LenLe),
  440            length(Ri,LenRi),
  441            assertz(split(Le,LenLe,CondLe,Ri,LenRi,CondRi,Context)) ),
  442          _).
  443
  444
  445/* ----------------------------------------------------------------------
  446   Rules for final tokens
  447---------------------------------------------------------------------- */
  448
  449final("?", "?", [], 1).
  450final(".", ".", [], 1).
  451
  452final([46,Q],[46], [Q],1):- quote(Q).
  453
  454
  455/* ----------------------------------------------------------------------
  456   Try a splitting rule on the input
  457---------------------------------------------------------------------- */
  458
  459trysplit(Input,Left,Right,Rest,LenLeft,LenRight):-
  460   split(Left,LenLeft,CondsLeft,Right,LenRight,CondsRight,RightContext),
  461   append(Left,Middle,Input), 
  462   checkConds(CondsLeft),  
  463   append(Right,Rest,Middle), 
  464   checkConds(CondsRight),   
  465   append(RightContext,_,Rest), !.
  466
  467
  468/* ----------------------------------------------------------------------
  469   Check Conditions
  470---------------------------------------------------------------------- */
  471
  472checkConds([]).
  473checkConds([C|L]):- call(C), !, checkConds(L).
  474
  475
  476/* ----------------------------------------------------------------------------------
  477   Codes for right single quotation marks (used in genitives)
  478---------------------------------------------------------------------------------- */
  479
  480rsq(39).
  481rsq(8217).
  482
  483
  484/* ----------------------------------------------------------------------------------
  485   Codes for single-character quotes
  486---------------------------------------------------------------------------------- */
  487
  488quote(34).    %%% "
  489quote(39).    %%% '
  490quote(96).    %%% `
  491quote(8216).  %%% left single quotation mark
  492quote(8217).  %%% right single quotation mark
  493quote(8218).  %%% low single quotation mark
  494quote(8220).  %%% left double quotation mark
  495quote(8221).  %%% right double quotation mark
  496quote(8222).  %%% low double quotation mark
  497
  498
  499/* ----------------------------------------------------------------------------------
  500   Codes for double quotes
  501---------------------------------------------------------------------------------- */
  502
  503quotes(96).    %%% ``
  504quotes(39).    %%% ''
  505quotes(8216).
  506quotes(8217).
  507quotes(8218).
  508
  509
  510
  511/* =======================================================================
  512   Open Input File
  513========================================================================*/
  514
  515openInput(Stream):-
  516   option('--stdin',dont),
  517   option('--input',File),
  518   exists_file(File), !,
  519   open(File,read,Stream,[encoding(utf8)]).
  520
  521openInput(Stream):-
  522   option('--stdin',do), 
  523   set_prolog_flag(encoding,utf8),
  524   warning('reading from standard input',[]),
  525   prompt(_,''),
  526   Stream = user_input.
  527
  528
  529/* =======================================================================
  530   Open Output File
  531========================================================================*/
  532
  533openOutput(Stream):-
  534   option('--output',Output),
  535   atomic(Output),
  536   \+ Output=user_output,
  537   ( access_file(Output,write), !,
  538     open(Output,write,Stream,[encoding(utf8)])
  539   ; error('cannot write to specified file ~p',[Output]),
  540     Stream=user_output ), !.
  541
  542openOutput(user_output).
  543
  544
  545/* =======================================================================
  546   Help
  547========================================================================*/
  548
  549help:-
  550   option('--help',do), !,
  551   format(user_error,'usage: tokkie [options]~n~n',[]),
  552   showOptions(tokkie).
  553
  554help:-
  555   option('--help',dont), !.
  556
  557
  558/* =======================================================================
  559   Definition of start
  560========================================================================*/
  561
  562start:-
  563   current_prolog_flag(argv,[_Comm|Args]),
  564   setDefaultOptions(tokkie), 
  565   parseOptions(tokkie,Args),
  566   tokkie, !,
  567   halt.
  568
  569start:- 
  570   error('tokkie failed',[]), 
  571   halt