csv.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2022, VU University Amsterdam
    7                              CWI, Amsterdam,
    8                              SWI-Prolog Solutions b.v.
    9    All rights reserved.
   10
   11    Redistribution and use in source and binary forms, with or without
   12    modification, are permitted provided that the following conditions
   13    are met:
   14
   15    1. Redistributions of source code must retain the above copyright
   16       notice, this list of conditions and the following disclaimer.
   17
   18    2. Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in
   20       the documentation and/or other materials provided with the
   21       distribution.
   22
   23    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   24    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   25    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   26    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   27    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   28    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   29    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   30    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   31    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   33    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   34    POSSIBILITY OF SUCH DAMAGE.
   35*/
   36
   37:- module(csv,
   38          [ csv//1,                     % +Rows
   39            csv//2,                     % +Rows, +Options
   40
   41            csv_read_file/2,            % +File, -Data
   42            csv_read_file/3,            % +File, -Data, +Options
   43            csv_read_stream/3,          % +Stream, -Data, +Options
   44
   45            csv_read_file_row/3,        % +File, -Row, +Options
   46            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   47            csv_options/2,		% -Compiled, +Options
   48
   49            csv_write_file/2,           % +File, +Data
   50            csv_write_file/3,           % +File, +Data, +Options
   51            csv_write_stream/3          % +Stream, +Data, +Options
   52          ]).   53:- use_module(library(record),[(record)/1, op(_,_,record)]).   54
   55:- autoload(library(apply),[maplist/2]).   56:- use_module(library(debug),[debug/3]).   57:- autoload(library(error),[must_be/2,domain_error/2]).   58:- autoload(library(lists),[append/3]).   59:- autoload(library(option),[option/2,select_option/4]).   60:- autoload(library(pure_input),
   61	    [phrase_from_file/3,phrase_from_stream/2]).   62:- autoload(library(readutil),[read_line_to_codes/2]).   63:- autoload(library(dcg/basics),[string//1,eos//0]).   64
   65
   66/** <module> Process CSV (Comma-Separated Values) data
   67
   68This library parses and generates CSV data.   CSV data is represented in
   69Prolog as a list of rows. Each row   is  a compound term, where all rows
   70have the same name and arity.
   71
   72@tbd    Implement immediate assert of the data to avoid possible stack
   73        overflows.
   74@tbd    Writing creates an intermediate code-list, possibly overflowing
   75        resources.  This waits for pure output!
   76@see RFC 4180
   77*/
   78
   79:- predicate_options(csv//2, 2,
   80                     [ separator(code),         % must be code
   81                       strip(boolean),
   82                       ignore_quotes(boolean),
   83                       convert(boolean),
   84                       case(oneof([down,preserve,up])),
   85                       functor(atom),
   86                       arity(-nonneg),          % actually ?nonneg
   87                       match_arity(boolean)
   88                     ]).   89:- predicate_options(csv_read_file/3, 3,
   90                     [ pass_to(csv//2, 2),
   91                       pass_to(phrase_from_file/3, 3)
   92                     ]).   93:- predicate_options(csv_read_file_row/3, 3,
   94                     [ line(-integer),
   95                       pass_to(csv//2, 2),
   96                       pass_to(open/4, 4)
   97                     ]).   98:- predicate_options(csv_write_file/3, 3,
   99                     [ pass_to(csv//2, 2),
  100                       pass_to(open/4, 4)
  101                     ]).  102:- predicate_options(csv_write_stream/3, 3,
  103                     [ pass_to(csv//2, 2)
  104                     ]).  105
  106
  107:- record
  108    csv_options(separator:integer=0',,
  109                strip:boolean=false,
  110                ignore_quotes:boolean=false,
  111                convert:boolean=true,
  112                case:oneof([down,preserve,up])=preserve,
  113                functor:atom=row,
  114                arity:integer,
  115                match_arity:boolean=true,
  116                skip_header:atom).  117
  118
  119%!  csv_read_file(+File, -Rows) is det.
  120%!  csv_read_file(+File, -Rows, +Options) is det.
  121%
  122%   Read a CSV file into a list of   rows. Each row is a Prolog term
  123%   with the same arity. Options  is   handed  to  csv//2. Remaining
  124%   options  are  processed  by    phrase_from_file/3.  The  default
  125%   separator depends on the file name   extension and is =|\t|= for
  126%   =|.tsv|= files and =|,|= otherwise.
  127%
  128%   Suppose we want to create a predicate   table/6  from a CSV file
  129%   that we know contains 6 fields  per   record.  This  can be done
  130%   using the code below. Without the   option  arity(6), this would
  131%   generate a predicate table/N, where N   is  the number of fields
  132%   per record in the data.
  133%
  134%       ==
  135%       ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
  136%          maplist(assert, Rows).
  137%       ==
  138
  139
  140csv_read_file(File, Rows) :-
  141    csv_read_file(File, Rows, []).
  142
  143csv_read_file(File, Rows, Options) :-
  144    default_separator(File, Options, Options1),
  145    make_csv_options(Options1, Record, RestOptions),
  146    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  147
  148
  149default_separator(File, Options0, Options) :-
  150    (   option(separator(_), Options0)
  151    ->  Options = Options0
  152    ;   file_name_extension(_, Ext0, File),
  153        downcase_atom(Ext0, Ext),
  154        ext_separator(Ext, Sep)
  155    ->  Options = [separator(Sep)|Options0]
  156    ;   Options = Options0
  157    ).
  158
  159ext_separator(csv, 0',).
  160ext_separator(tsv, 0'\t).
  161
  162
  163%!  csv_read_stream(+Stream, -Rows, +Options) is det.
  164%
  165%   Read CSV data from Stream.  See also csv_read_row/3.
  166
  167csv_read_stream(Stream, Rows, Options) :-
  168    make_csv_options(Options, Record, _),
  169    phrase_from_stream(csv_roptions(Rows, Record), Stream).
  170
  171
  172%!  csv(?Rows)// is det.
  173%!  csv(?Rows, +Options)// is det.
  174%
  175%   Prolog DCG to `read/write' CSV data.  Options:
  176%
  177%       * separator(+Code)
  178%       The comma-separator.  Must be a character code.  Default is
  179%       (of course) the comma. Character codes can be specified
  180%       using the 0' notation. E.g., using =|separator(0';)|= parses
  181%       a semicolon separated file.
  182%
  183%       * ignore_quotes(+Boolean)
  184%       If =true= (default false), threat double quotes as a normal
  185%       character.
  186%
  187%       * strip(+Boolean)
  188%       If =true= (default =false=), strip leading and trailing
  189%       blank space.  RFC4180 says that blank space is part of the
  190%       data.
  191%
  192%       * skip_header(+CommentLead)
  193%       Skip leading lines that start with CommentLead.  There is
  194%       no standard for comments in CSV files, but some CSV files
  195%       have a header where each line starts with `#`.  After
  196%       skipping comment lines this option causes csv//2 to skip empty
  197%       lines.  Note that an empty line may not contain white space
  198%       characters (space or tab) as these may provide valid data.
  199%
  200%       * convert(+Boolean)
  201%       If =true= (default), use name/2 on the field data.  This
  202%       translates the field into a number if possible.
  203%
  204%       * case(+Action)
  205%       If =down=, downcase atomic values.  If =up=, upcase them
  206%       and if =preserve= (default), do not change the case.
  207%
  208%       * functor(+Atom)
  209%       Functor to use for creating row terms.  Default is =row=.
  210%
  211%       * arity(?Arity)
  212%       Number of fields in each row.  This predicate raises
  213%       a domain_error(row_arity(Expected), Found) if a row is
  214%       found with different arity.
  215%
  216%       * match_arity(+Boolean)
  217%       If =false= (default =true=), do not reject CSV files where
  218%       lines provide a varying number of fields (columns).  This
  219%       can be a work-around to use some incorrect CSV files.
  220
  221csv(Rows) -->
  222    csv(Rows, []).
  223
  224csv(Rows, Options) -->
  225    { make_csv_options(Options, Record, _) },
  226    csv_roptions(Rows, Record).
  227
  228csv_roptions(Rows, Record) -->
  229    { ground(Rows) },
  230    !,
  231    emit_csv(Rows, Record).
  232csv_roptions(Rows, Record) -->
  233    skip_header(Record),
  234    csv_data(Rows, Record).
  235
  236skip_header(Options) -->
  237    { csv_options_skip_header(Options, CommentStart),
  238      nonvar(CommentStart),
  239      atom_codes(CommentStart, Codes)
  240    },
  241    !,
  242    skip_header_lines(Codes),
  243    skip_blank_lines.
  244skip_header(_) -->
  245    [].
  246
  247skip_header_lines(CommentStart) -->
  248    string(CommentStart),
  249    !,
  250    (   string(_Comment),
  251        end_of_record
  252    ->  skip_header_lines(CommentStart)
  253    ).
  254skip_header_lines(_) -->
  255    [].
  256
  257skip_blank_lines -->
  258    eos,
  259    !.
  260skip_blank_lines -->
  261    end_of_record,
  262    !,
  263    skip_blank_lines.
  264skip_blank_lines -->
  265    [].
  266
  267csv_data([], _) -->
  268    eos,
  269    !.
  270csv_data([Row|More], Options) -->
  271    row(Row, Options),
  272    !,
  273    { debug(csv, 'Row: ~p', [Row]) },
  274    csv_data(More, Options).
  275
  276
  277row(Row, Options) -->
  278    fields(Fields, Options),
  279    { csv_options_functor(Options, Functor),
  280      Row =.. [Functor|Fields],
  281      functor(Row, _, Arity),
  282      check_arity(Options, Arity)
  283    }.
  284
  285check_arity(Options, Arity) :-
  286    csv_options_arity(Options, Arity),
  287    !.
  288check_arity(Options, _) :-
  289    csv_options_match_arity(Options, false),
  290    !.
  291check_arity(Options, Arity) :-
  292    csv_options_arity(Options, Expected),
  293    domain_error(row_arity(Expected), Arity).
  294
  295fields([F|T], Options) -->
  296    field(F, Options),
  297    (   separator(Options)
  298    ->  fields(T, Options)
  299    ;   end_of_record
  300    ->  { T = [] }
  301    ).
  302
  303field(Value, Options) -->
  304    "\"",
  305    { csv_options_ignore_quotes(Options, false) },
  306    !,
  307    string_codes(Codes),
  308    { make_value(Codes, Value, Options) }.
  309field(Value, Options) -->
  310    { csv_options_strip(Options, true) },
  311    !,
  312    stripped_field(Value, Options).
  313field(Value, Options) -->
  314    { csv_options_separator(Options, Sep) },
  315    field_codes(Codes, Sep),
  316    { make_value(Codes, Value, Options) }.
  317
  318
  319stripped_field(Value, Options) -->
  320    ws,
  321    (   "\"",
  322        { csv_options_strip(Options, false) }
  323    ->  string_codes(Codes),
  324        ws
  325    ;   { csv_options_separator(Options, Sep) },
  326        field_codes(Codes0, Sep),
  327        { strip_trailing_ws(Codes0, Codes) }
  328    ),
  329    { make_value(Codes, Value, Options) }.
  330
  331ws --> " ", !, ws.
  332ws --> "\t", !, ws.
  333ws --> "".
  334
  335strip_trailing_ws(List, Stripped) :-
  336    append(Stripped, WS, List),
  337    all_ws(WS).
  338
  339all_ws([]).
  340all_ws([32|T]) :- all_ws(T).
  341all_ws([9|T]) :- all_ws(T).
  342
  343
  344%!  string_codes(-Codes)
  345%
  346%   Process a double-quotes string where  the   quote  is escaped by
  347%   doubling it. Eats the terminating double-quote.
  348
  349string_codes(List) -->
  350    [H],
  351    (   { H == 0'" }
  352    ->  (   "\""
  353        ->  { List = [H|T] },
  354            string_codes(T)
  355        ;   { List = [] }
  356        )
  357    ;   { List = [H|T] },
  358        string_codes(T)
  359    ).
  360
  361field_codes([], Sep), [Sep] --> [Sep], !.
  362field_codes([], _), "\n" --> "\r\n", !.
  363field_codes([], _), "\n" --> "\n", !.
  364field_codes([], _), "\n" --> "\r", !.
  365field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  366field_codes([], _) --> [].              % unterminated last record
  367
  368%!  make_value(+Codes, -Value, +Options) is det.
  369%
  370%   Convert a list of character codes to the actual value, depending
  371%   on Options.
  372
  373make_value(Codes, Value, Options) :-
  374    csv_options_convert(Options, Convert),
  375    csv_options_case(Options, Case),
  376    make_value(Convert, Case, Codes, Value).
  377
  378make_value(true, preserve, Codes, Value) :-
  379    !,
  380    name(Value, Codes).
  381make_value(true, Case, Codes, Value) :-
  382    !,
  383    (   number_string(Value, Codes)
  384    ->  true
  385    ;   make_value(false, Case, Codes, Value)
  386    ).
  387make_value(false, preserve, Codes, Value) :-
  388    !,
  389    atom_codes(Value, Codes).
  390make_value(false, down, Codes, Value) :-
  391    !,
  392    string_codes(String, Codes),
  393    downcase_atom(String, Value).
  394make_value(false, up, Codes, Value) :-
  395    string_codes(String, Codes),
  396    upcase_atom(String, Value).
  397
  398separator(Options) -->
  399    { csv_options_separator(Options, Sep) },
  400    [Sep].
  401
  402end_of_record --> "\n".			% Unix files
  403end_of_record --> "\r\n".               % DOS files
  404end_of_record --> "\r".                 % MacOS files
  405end_of_record --> eos.                  % unterminated last record
  406
  407
  408%!  csv_read_file_row(+File, -Row, +Options) is nondet.
  409%
  410%   True when Row is a row in File.  First unifies Row with the first
  411%   row in File. Backtracking  yields  the   second,  ...  row.  This
  412%   interface  is  an  alternative  to  csv_read_file/3  that  avoids
  413%   loading all rows in memory.  Note   that  this interface does not
  414%   guarantee that all rows in File have the same arity.
  415%
  416%   In addition to the  options   of  csv_read_file/3, this predicate
  417%   processes the option:
  418%
  419%     * line(-Line)
  420%     Line is unified with the 1-based line-number from which Row is
  421%     read.  Note that Line is not the physical line, but rather the
  422%     _logical_ record number.
  423
  424csv_read_file_row(File, Row, Options) :-
  425    default_separator(File, Options, Options1),
  426    make_csv_options(Options1, RecordOptions, Options2),
  427    select_option(line(Line), Options2, RestOptions, _),
  428    setup_call_cleanup(
  429        open(File, read, Stream, RestOptions),
  430        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  431        close(Stream)).
  432
  433csv_read_stream_row(Stream, Row, Line, Options) :-
  434    between(1, infinite, Line),
  435    (   csv_read_row(Stream, Row0, Options),
  436        Row0 \== end_of_file
  437    ->  Row = Row0
  438    ;   !,
  439        fail
  440    ).
  441
  442
  443%!  csv_read_row(+Stream, -Row, +CompiledOptions) is det.
  444%
  445%   Read the next CSV record from Stream  and unify the result with Row.
  446%   CompiledOptions is created from  options   defined  for csv//2 using
  447%   csv_options/2. Row is unified with   `end_of_file` upon reaching the
  448%   end of the input.
  449
  450csv_read_row(Stream, Row, _Record) :-
  451    at_end_of_stream(Stream),
  452    !,
  453    Row = end_of_file.
  454csv_read_row(Stream, Row, Record) :-
  455    read_lines_to_codes(Stream, Codes, Record, even),
  456    phrase(row(Row0, Record), Codes),
  457    !,
  458    Row = Row0.
  459
  460read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  461    read_line_to_codes(Stream, Codes0),
  462    Codes0 \== end_of_file,
  463    (   (   csv_options_ignore_quotes(Options, true)
  464        ;   check_quotes(Codes0, QuoteQuantity, even)
  465        )
  466    ->  Codes = Codes0
  467    ;   append(Codes0, [0'\n|Tail], Codes),
  468        read_lines_to_codes(Stream, Tail, Options, odd)
  469    ).
  470
  471check_quotes([], QuoteQuantity, QuoteQuantity) :-
  472    !.
  473check_quotes([0'"|T], odd, Result) :-
  474    !,
  475    check_quotes(T, even, Result).
  476check_quotes([0'"|T], even, Result) :-
  477    !,
  478    check_quotes(T, odd, Result).
  479check_quotes([_|T], QuoteQuantity, Result) :-
  480    check_quotes(T, QuoteQuantity, Result).
  481
  482
  483%!  csv_options(-Compiled, +Options) is det.
  484%
  485%   Compiled is the  compiled  representation   of  the  CSV  processing
  486%   options as they may be passed into   csv//2,  etc. This predicate is
  487%   used in combination with csv_read_row/3 to avoid repeated processing
  488%   of the options.
  489
  490csv_options(Compiled, Options) :-
  491    make_csv_options(Options, Compiled, _Ignored).
  492
  493
  494                /*******************************
  495                *             OUTPUT           *
  496                *******************************/
  497
  498%!  csv_write_file(+File, +Data) is det.
  499%!  csv_write_file(+File, +Data, +Options) is det.
  500%
  501%   Write a list of Prolog terms to a CSV file.  Options are given
  502%   to csv//2.  Remaining options are given to open/4.  The  default
  503%   separator depends on the file name   extension and is =|\t|= for
  504%   =|.tsv|= files and =|,|= otherwise.
  505
  506csv_write_file(File, Data) :-
  507    csv_write_file(File, Data, []).
  508
  509csv_write_file(File, Data, Options) :-
  510    must_be(list, Data),
  511    default_separator(File, Options, Options1),
  512    make_csv_options(Options1, OptionsRecord, RestOptions),
  513    setup_call_cleanup(
  514        open(File, write, Out, RestOptions),
  515        maplist(csv_write_row(Out, OptionsRecord), Data),
  516        close(Out)).
  517
  518csv_write_row(Out, OptionsRecord, Row) :-
  519    phrase(emit_row(Row, OptionsRecord), String),
  520    format(Out, '~s', [String]).
  521
  522emit_csv([], _) --> [].
  523emit_csv([H|T], Options) -->
  524    emit_row(H, Options),
  525    emit_csv(T, Options).
  526
  527emit_row(Row, Options) -->
  528    { Row =.. [_|Fields] },
  529    emit_fields(Fields, Options),
  530    "\r\n".                                     % RFC 4180 demands \r\n
  531
  532emit_fields([], _) -->
  533    "".
  534emit_fields([H|T], Options) -->
  535    emit_field(H, Options),
  536    (   { T == [] }
  537        ->  []
  538        ;   { csv_options_separator(Options, Sep) },
  539        [Sep],
  540        emit_fields(T, Options)
  541    ).
  542
  543emit_field(H, Options) -->
  544    { (   atom(H)
  545      ->  atom_codes(H, Codes)
  546      ;   string(H)
  547      ->  string_codes(H, Codes)
  548      )
  549    },
  550    !,
  551    (   { needs_quotes(H, Options) }
  552    ->  "\"", emit_string(Codes), "\""
  553    ;   emit_codes(Codes)
  554    ).
  555emit_field([], _) -->
  556    !,
  557    { atom_codes('[]', Codes) },
  558    emit_codes(Codes).
  559emit_field(H, _) -->
  560    { number_codes(H,Codes) },
  561    emit_codes(Codes).
  562
  563needs_quotes(Atom, _) :-
  564    sub_atom(Atom, _, _, _, '"'),
  565    !.
  566needs_quotes(Atom, _) :-
  567    sub_atom(Atom, _, _, _, '\n'),
  568    !.
  569needs_quotes(Atom, _) :-
  570    sub_atom(Atom, _, _, _, '\r'),
  571    !.
  572needs_quotes(Atom, Options) :-
  573    csv_options_separator(Options, Sep),
  574    char_code(Char, Sep),
  575    sub_atom(Atom, _, _, _, Char),
  576    !.
  577
  578emit_string([]) --> "".
  579emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  580emit_string([H|T]) --> [H], emit_string(T).
  581
  582emit_codes([]) --> "".
  583emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  584emit_codes([H|T]) --> [H], emit_codes(T).
  585
  586
  587%%     csv_write_stream(+Stream, +Data, +Options) is det.
  588%
  589%      Write  the  rows  in  Data  to    Stream.   This  is  similar  to
  590%      csv_write_file/3,  but  can  deal  with  data  that  is  produced
  591%      incrementally. The example  below  saves   all  answers  from the
  592%      predicate data/3 to File.
  593%
  594%        ==
  595%        save_data(File) :-
  596%           setup_call_cleanup(
  597%               open(File, write, Out),
  598%               forall(data(C1,C2,C3),
  599%                      csv_write_stream(Out, [row(C1,C2,C3)], [])),
  600%               close(Out)).
  601%        ==
  602
  603csv_write_stream(Stream, Data, Options) :-
  604    must_be(list, Data),
  605    make_csv_options(Options, OptionsRecord, _),
  606    maplist(csv_write_row(Stream, OptionsRecord), Data)