// Mercurial > hg > CbC > CbC_gcc

//Written in the D programming language

/**
 * Implements functionality to read Comma Separated Values and its variants
 * from an input range of $(D dchar).
 *
 * Comma Separated Values provide a simple means to transfer and store
 * tabular data. It has been common for programs to use their own
 * variant of the CSV format. This parser will loosely follow the
 * $(HTTP tools.ietf.org/html/rfc4180, RFC-4180). CSV input should adhere
 * to the following criteria (differences from RFC-4180 in parentheses):
 *
 * $(UL
 *     $(LI A record is separated by a new line (CRLF,LF,CR))
 *     $(LI A final record may end with a new line)
 *     $(LI A header may be provided as the first record in input)
 *     $(LI A record has fields separated by a comma (customizable))
 *     $(LI A field containing new lines, commas, or double quotes
 *          should be enclosed in double quotes (customizable))
 *     $(LI Double quotes in a field are escaped with a double quote)
 *     $(LI Each record should contain the same number of fields)
 *   )
 *
 * Example:
 *
 * -------
 * import std.algorithm;
 * import std.array;
 * import std.csv;
 * import std.stdio;
 * import std.typecons;
 *
 * void main()
 * {
 *     auto text = "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
 *
 *     foreach (record; csvReader!(Tuple!(string, string, int))(text))
 *     {
 *         writefln("%s works as a %s and earns $%d per year",
 *                  record[0], record[1], record[2]);
 *     }
 *
 *     // To read the same string from the file "filename.csv":
 *
 *     auto file = File("filename.csv", "r");
 *     foreach (record;
 *         file.byLine.joiner("\n").csvReader!(Tuple!(string, string, int)))
 *     {
 *         writefln("%s works as a %s and earns $%d per year",
 *                  record[0], record[1], record[2]);
 *     }
 * }
 * -------
 *
 * When an input contains a header the $(D Contents) can be specified as an
 * associative array. Pass null to signify that a header is present.
 *
 * -------
 * auto text = "Name,Occupation,Salary\r" ~
 *     "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
 *
 * foreach (record; csvReader!(string[string])
 *         (text, null))
 * {
 *     writefln("%s works as a %s and earns $%s per year.",
 *              record["Name"], record["Occupation"],
 *              record["Salary"]);
 * }
 * -------
 *
 * This module allows content to be iterated by record stored in a struct,
 * class, associative array, or as a range of fields. Upon detection of an
 * error a CSVException is thrown (can be disabled). csvNextToken has been
 * made public to allow for attempted recovery.
 *
 * Disabling exceptions will lift many restrictions specified above. A quote
 * can appear in a field if the field was not quoted. If in a quoted field any
 * quote by itself, not at the end of a field, will end processing for that
 * field. The field is ended when there is no input, even if the quote was not
 * closed.
 *
 *   See_Also:
 *      $(HTTP en.wikipedia.org/wiki/Comma-separated_values, Wikipedia
 *      Comma-separated values)
 *
 *   Copyright: Copyright 2011
 *   License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 *   Authors:   Jesse Phillips
 *   Source:    $(PHOBOSSRC std/_csv.d)
 */
module std.csv;

import std.conv;
import std.exception;  // basicExceptionCtors
import std.range.primitives;
import std.traits;

/**
 * Exception containing the row and column for when an exception was thrown.
 *
 * Numbering of both row and col start at one and corresponds to the location
 * in the file rather than any specified header. Special consideration should
 * be made when there is failure to match the header see $(LREF
 * HeaderMismatchException) for details.
 *
 * When performing type conversions, $(REF ConvException, std,conv) is stored in
 * the $(D next) field.
 */
class CSVException : Exception
{
    /// Location associated with the error; both stay zero unless supplied.
    size_t row, col;

    // FIXME: Use std.exception.basicExceptionCtors here once bug #11500 is fixed

    /// Message-only form; `row` and `col` keep their default of zero.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Same as above with the chained exception given before the location.
    this(string msg, Throwable next, string file = __FILE__,
         size_t line = __LINE__) @nogc @safe pure nothrow
    {
        // Reuse the message-only constructor; only the argument order differs.
        this(msg, file, line, next);
    }

    /// Form that additionally records the row and column of the failure.
    this(string msg, size_t row, size_t col, Throwable next = null,
         string file = __FILE__, size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, next, file, line);
        this.col = col;
        this.row = row;
    }

    /// Renders the exception as "(Row: R, Col: C) message".
    override string toString() @safe pure const
    {
        string location = "(Row: " ~ to!string(row);
        location ~= ", Col: " ~ to!string(col) ~ ") ";
        return location ~ msg;
    }
}

@safe pure unittest
{
    import std.string;

    // The cause handed to the constructor is exposed through `next`.
    auto cause = new Exception("Foobar");
    auto chained = new CSVException("args", cause);
    assert(chained.next is cause);

    // The positional constructor stores the coordinates verbatim.
    size_t wantRow = 13;
    size_t wantCol = 37;
    auto located = new CSVException("argv", wantRow, wantCol);
    assert(located.row == wantRow);
    assert(located.col == wantCol);

    // Both coordinates must appear in the rendered text.
    auto rendered = located.toString();
    assert(rendered.indexOf("13") != -1);
    assert(rendered.indexOf("37") != -1);
}

/**
 * Exception thrown when a Token is identified to not be completed: a quote is
 * found in an unquoted field, data continues after a closing quote, or the
 * quoted field was not closed before data was empty.
 */
class IncompleteCellException : CSVException
{
    /**
     * Data pulled from input before finding a problem
     *
     * This field is populated when using $(LREF csvReader)
     * but not by $(LREF csvNextToken) as this data will have
     * already been fed to the output range.
     */
    dstring partialData;

    // Provides the standard message/next/file/line constructors, which
    // forward to the corresponding CSVException constructors.
    mixin basicExceptionCtors;
}

@safe pure unittest
{
    // The mixed-in constructor must forward the chained exception.
    auto cause = new Exception("Foobar");
    auto incomplete = new IncompleteCellException("args", cause);
    assert(incomplete.next is cause);
}

/**
 * Exception thrown under different conditions based on the type of $(D
 * Contents).
 *
 * Structure, Class, and Associative Array
 * $(UL
 *     $(LI When a header is provided but a matching column is not found)
 *  )
 *
 * Other
 * $(UL
 *     $(LI When a header is provided but a matching column is not found)
 *     $(LI Order did not match that found in the input)
 *  )
 *
 * Since a row and column is not meaningful when a column specified by the
 * header is not found in the data, both row and col will be zero. Otherwise
 * row is always one and col is the first instance found in header that
 * occurred before the previous starting at one.
 */
class HeaderMismatchException : CSVException
{
    // Provides the standard message/next/file/line constructors, which
    // forward to the corresponding CSVException constructors; row and col
    // are left at zero unless the thrower assigns them afterwards.
    mixin basicExceptionCtors;
}

@safe pure unittest
{
    // The mixed-in constructor must forward the chained exception.
    auto cause = new Exception("Foobar");
    auto mismatch = new HeaderMismatchException("args", cause);
    assert(mismatch.next is cause);
}

/**
 * Determines the behavior for when an error is detected.
 *
 * Disabling exception will follow these rules:
 * $(UL
 *     $(LI A quote can appear in a field if the field was not quoted.)
 *     $(LI If in a quoted field any quote by itself, not at the end of a
 *     field, will end processing for that field.)
 *     $(LI The field is ended when there is no input, even if the quote was
 *     not closed.)
 *     $(LI If the given header does not match the order in the input, the
 *     content will return as it is found in the input.)
 *     $(LI If the given header contains columns not found in the input they
 *     will be ignored.)
 *  )
*/
enum Malformed
{
    /// No exceptions are thrown due to incorrect CSV.
    ignore,

    /// Use exceptions when input has incorrect CSV.
    throwException,
}

/**
 * Returns an input range for iterating over records found in $(D
 * input).
 *
 * The $(D Contents) of the input can be provided if all the records are the
 * same type such as all integer data:
 *
 * -------
 * string str = `76,26,22`;
 * int[] ans = [76,26,22];
 * auto records = csvReader!int(str);
 *
 * foreach (record; records)
 * {
 *     assert(equal(record, ans));
 * }
 * -------
 *
 * Example using a struct with modified delimiter:
 *
 * -------
 * string str = "Hello;65;63.63\nWorld;123;3673.562";
 * struct Layout
 * {
 *     string name;
 *     int value;
 *     double other;
 * }
 *
 * auto records = csvReader!Layout(str,';');
 *
 * foreach (record; records)
 * {
 *     writeln(record.name);
 *     writeln(record.value);
 *     writeln(record.other);
 * }
 * -------
 *
 * Specifying $(D ErrorLevel) as Malformed.ignore will lift restrictions
 * on the format. This example shows that an exception is not thrown when
 * finding a quote in a field not quoted.
 *
 * -------
 * string str = "A \" is now part of the data";
 * auto records = csvReader!(string,Malformed.ignore)(str);
 * auto record = records.front;
 *
 * assert(record.front == str);
 * -------
 *
 * Returns:
 *        An input range R as defined by
 *        $(REF isInputRange, std,range,primitives). When $(D Contents) is a
 *        struct, class, or an associative array, the element type of R is
 *        $(D Contents), otherwise the element type of R is itself a range with
 *        element type $(D Contents).
 *
 * Throws:
 *       $(LREF CSVException) When a quote is found in an unquoted field,
 *       data continues after a closing quote, the quoted field was not
 *       closed before data was empty, a conversion failed, or when the row's
 *       length does not match the previous length.
 *
 *       $(LREF HeaderMismatchException)  when a header is provided but a
 *       matching column is not found or the order did not match that found in
 *       the input. Read the exception documentation for specific details of
 *       when the exception is thrown for different types of $(D Contents).
 */
auto csvReader(Contents = string,Malformed ErrorLevel = Malformed.throwException, Range, Separator = char)(Range input,
                 Separator delimiter = ',', Separator quote = '"')
if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar)
    && isSomeChar!(Separator)
    && !is(Contents T : T[U], U : string))
{
    // Header-less form: the reader is instantiated with string[] as its
    // header type and built through the constructor that takes no header.
    alias Reader = CsvReader!(Contents, ErrorLevel, Range,
                              Unqual!(ElementType!Range), string[]);
    return Reader(input, delimiter, quote);
}

/**
 * An optional $(D header) can be provided. The first record will be read in
 * as the header. If $(D Contents) is a struct then the header provided is
 * expected to correspond to the fields in the struct. When $(D Contents) is
 * not a type which can contain the entire record, the $(D header) must be
 * provided in the same order as the input or an exception is thrown.
 *
 * Read only column "b":
 *
 * -------
 * string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
 * auto records = csvReader!int(str, ["b"]);
 *
 * auto ans = [[65],[123]];
 * foreach (record; records)
 * {
 *     assert(equal(record, ans.front));
 *     ans.popFront();
 * }
 * -------
 *
 * Read from header of different order:
 *
 * -------
 * string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
 * struct Layout
 * {
 *     int value;
 *     double other;
 *     string name;
 * }
 *
 * auto records = csvReader!Layout(str, ["b","c","a"]);
 * -------
 *
 * The header can also be left empty if the input contains a header but
 * all columns should be iterated. The header from the input can always
 * be accessed from the header field.
 *
 * -------
 * string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
 * auto records = csvReader(str, null);
 *
 * assert(records.header == ["a","b","c"]);
 * -------
 *
 * Returns:
 *        An input range R as defined by
 *        $(REF isInputRange, std,range,primitives). When $(D Contents) is a
 *        struct, class, or an associative array, the element type of R is
 *        $(D Contents), otherwise the element type of R is itself a range with
 *        element type $(D Contents).
 *
 *        The returned range provides a header field for accessing the header
 *        from the input in array form.
 *
 * -------
 * string str = "a,b,c\nHello,65,63.63";
 * auto records = csvReader(str, ["a"]);
 *
 * assert(records.header == ["a","b","c"]);
 * -------
 *
 * Throws:
 *       $(LREF CSVException) When a quote is found in an unquoted field,
 *       data continues after a closing quote, the quoted field was not
 *       closed before data was empty, a conversion failed, or when the row's
 *       length does not match the previous length.
 *
 *       $(LREF HeaderMismatchException)  when a header is provided but a
 *       matching column is not found or the order did not match that found in
 *       the input. Read the exception documentation for specific details of
 *       when the exception is thrown for different types of $(D Contents).
 */
auto csvReader(Contents = string,
               Malformed ErrorLevel = Malformed.throwException,
               Range, Header, Separator = char)
                (Range input, Header header,
                 Separator delimiter = ',', Separator quote = '"')
if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar)
    && isSomeChar!(Separator)
    && isForwardRange!Header
    && isSomeString!(ElementType!Header))
{
    // Explicit-header form: the caller's header range type is forwarded
    // unchanged to the reader, which consumes the first record as the header.
    alias Reader = CsvReader!(Contents, ErrorLevel, Range,
                              Unqual!(ElementType!Range), Header);
    return Reader(input, header, delimiter, quote);
}

///
auto csvReader(Contents = string,
               Malformed ErrorLevel = Malformed.throwException,
               Range, Header, Separator = char)
                (Range input, Header header,
                 Separator delimiter = ',', Separator quote = '"')
if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar)
    && isSomeChar!(Separator)
    && is(Header : typeof(null)))
{
    // A literal `null` header is passed on as an empty string[] so the
    // header-taking constructor is used with no requested columns.
    alias Reader = CsvReader!(Contents, ErrorLevel, Range,
                              Unqual!(ElementType!Range), string[]);
    string[] noColumns = null;
    return Reader(input, noColumns, delimiter, quote);
}

// Test standard iteration over input.
@safe pure unittest
{
    // Three records of two cells each, covering escaped quotes, a quoted
    // field containing a newline, and an empty trailing field.
    string input = `one,"two ""quoted"""` ~ "\n\"three\nnew line\",\nfive,six";

    int cells;
    for (auto rows = csvReader(input); !rows.empty; rows.popFront())
    {
        foreach (cell; rows.front)
            ++cells;
    }
    assert(cells == 6);
}

// Test newline on last record
@safe pure unittest
{
    // A newline after the final record must not produce an extra record.
    string input = "one,two\nthree,four\n";
    auto reader = csvReader(input);
    foreach (_; 0 .. 2)
        reader.popFront();
    assert(reader.empty);
}

// Test shorter row length
@safe pure unittest
{
    wstring str = "one,1\ntwo\nthree"w;
    struct Layout
    {
        string name;
        int value;
    }

    // With Malformed.ignore, rows lacking the second column are accepted and
    // `value` is expected to keep its default of 0.
    Layout[3] ans;
    ans[0].name = "one";
    ans[0].value = 1;
    ans[1].name = "two";
    ans[1].value = 0;
    ans[2].name = "three";
    ans[2].value = 0;

    auto records = csvReader!(Layout,Malformed.ignore)(str);

    int count;
    foreach (record; records)
    {
        assert(ans[count].name == record.name);
        assert(ans[count].value == record.value);
        count++;
    }
    // Without this check the test would pass vacuously if the reader
    // produced fewer records than expected (or none at all).
    assert(count == ans.length);
}

// Test shorter row length exception
@safe pure unittest
{
    import std.exception;

    struct A
    {
        string a,b,c;
    }

    // Every input has records of differing lengths, which must be rejected
    // when exceptions are enabled (the default).
    auto raggedInputs = ["one,1\ntwo",
                         "one\ntwo,2,二\nthree,3,三",
                         "one\ntwo,2\nthree,3",
                         "one,1\ntwo\nthree,3"];

    foreach (input; raggedInputs)
    {
        auto reader = csvReader!A(input);
        // Draining the reader is what has to throw.
        assertThrown!CSVException((){foreach (row; reader) { }}());
    }
}


// Test structure conversion interface with unicode.
@safe pure unittest
{
    import std.math : abs;

    // UTF-16 input whose first field starts with a code point outside the BMP.
    wstring input = "\U00010143Hello,65,63.63\nWorld,123,3673.562"w;
    struct Layout
    {
        string name;
        int value;
        double other;
    }

    Layout[2] expected = [Layout("\U00010143Hello", 65, 63.63),
                          Layout("World", 123, 3673.562)];

    int seen;
    foreach (row; csvReader!Layout(input))
    {
        auto want = expected[seen];
        assert(want.name == row.name);
        assert(want.value == row.value);
        // Floating point fields are compared with a tolerance.
        assert(abs(want.other - row.other) < 0.00001);
        seen++;
    }
    assert(seen == expected.length);
}

// Test input conversion interface
@safe pure unittest
{
    import std.algorithm;

    // Each cell of the single record is converted to int.
    string input = `76,26,22`;
    int[] expected = [76,26,22];

    foreach (row; csvReader!int(input))
        assert(equal(row, expected));
}

// Test struct & header interface and same unicode
@safe unittest
{
    import std.math : abs;

    string input = "a,b,c\nHello,65,63.63\n➊➋➂❹,123,3673.562";
    struct Layout
    {
        int value;
        double other;
        string name;
    }

    // The header lists the input columns in struct field order:
    // value <- "b", other <- "c", name <- "a".
    auto reader = csvReader!Layout(input, ["b","c","a"]);

    Layout[2] expected = [Layout(65, 63.63, "Hello"),
                          Layout(123, 3673.562, "➊➋➂❹")];

    int seen;
    foreach (row; reader)
    {
        auto want = expected[seen];
        assert(want.name == row.name);
        assert(want.value == row.value);
        assert(abs(want.other - row.other) < 0.00001);
        seen++;
    }
    assert(seen == expected.length);
}

// Test header interface
@safe unittest
{
    import std.algorithm;

    string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";

    // Selecting a single column yields one-element records.
    auto wantB = [[65],[123]];
    foreach (row; csvReader!int(str, ["b"]))
    {
        assert(equal(row, wantB.front));
        wantB.popFront();
    }

    // With exceptions enabled, requesting columns in a different order than
    // the input is rejected and the exception carries a column number.
    try
    {
        csvReader(str, ["c","b"]);
        assert(0);
    }
    catch (HeaderMismatchException mismatch)
    {
        assert(mismatch.col == 2);
    }

    // With Malformed.ignore the columns come back in input order instead.
    auto reordered = csvReader!(string,Malformed.ignore)
       (str, ["b","a"], ',', '"');

    auto wantAB = [["Hello","65"],["World","123"]];
    foreach (row; reordered)
    {
        assert(equal(row, wantAB.front));
        wantAB.popFront();
    }

    // Requested columns that are missing from the input are skipped.
    str = "a,c,e\nJoe,Carpenter,300000\nFred,Fly,4";
    auto partial = csvReader!(string,Malformed.ignore)
       (str, ["a","b","c","d"], ',', '"');

    auto wantAC = [["Joe","Carpenter"],["Fred","Fly"]];
    foreach (row; partial)
    {
        assert(equal(row, wantAC.front));
        wantAC.popFront();
    }
}

// Test null header interface
@safe unittest
{
    // The `header` field exposes every column name from the input, not just
    // the ones that were requested.
    string input = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
    auto reader = csvReader(input, ["a"]);
    string[] allColumns = ["a","b","c"];

    assert(reader.header == allColumns);
}

// Test unchecked read
@safe pure unittest
{
    // With Malformed.ignore, quotes inside an unquoted field are kept as data.
    string input = "one \"quoted\"";
    foreach (row; csvReader!(string,Malformed.ignore)(input))
    {
        foreach (field; row)
            assert(field == "one \"quoted\"");
    }

    // Same rule when the record is converted into a struct.
    struct Ans
    {
        string a,b;
    }
    input = "one \"quoted\",two \"quoted\" end";
    foreach (row; csvReader!(Ans,Malformed.ignore)(input))
    {
        assert(row.a == "one \"quoted\"");
        assert(row.b == "two \"quoted\" end");
    }
}

// Test partial data returned
@safe pure unittest
{
    // A quoted field that is never closed.
    string unterminated = "\"one\nnew line";

    try
    {
        foreach (row; csvReader(unterminated))
        {}
        assert(0);
    }
    catch (IncompleteCellException incomplete)
    {
        // Everything read before the input ran out is reported back.
        assert(incomplete.partialData == "one\nnew line");
    }
}

// Test Windows line break
@safe pure unittest
{
    // A CRLF pair must be treated as a single record separator.
    string input = "one,two\r\nthree";

    auto reader = csvReader(input);

    auto firstRow = reader.front;
    assert(firstRow.front == "one");
    firstRow.popFront();
    assert(firstRow.front == "two");

    reader.popFront();
    auto secondRow = reader.front;
    assert(secondRow.front == "three");
}


// Test associative array support with unicode separator
@safe unittest
{
  // The first record is the header; fields are separated by a non-ASCII
  // delimiter and each record is returned as string[string].
  string input = "1❁2❁3\n34❁65❁63\n34❁65❁63";

  int seen;
  foreach (row; csvReader!(string[string])(input,["3","1"],'❁'))
  {
      ++seen;
      assert(row["1"] == "34");
      assert(row["3"] == "63");
  }
  assert(seen == 2);
}

// Test restricted range
@safe unittest
{
    import std.typecons;

    // Minimal input range: offers only empty/front/popFront over dchar.
    struct InputRange
    {
        dstring text;

        this(dstring source)
        {
            text = source;
        }

        @property auto empty()
        {
            return text.empty;
        }

        void popFront()
        {
            text.popFront();
        }

        @property dchar front()
        {
            return text[0];
        }
    }

    auto source = InputRange("Name,Occupation,Salary\r"d~
          "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n"d);

    // Every Contents flavour must work on top of the bare input range:
    // range of fields, tuple, and associative array.
    foreach (row; csvReader(source, cast(string[]) null))
    {
        foreach (field; row) {}
    }
    foreach (row; csvReader!(Tuple!(string, string, int))
            (source,cast(string[]) null)) {}
    foreach (row; csvReader!(string[string])
            (source,cast(string[]) null)) {}
}

@safe unittest // const/immutable dchars
{
    import std.algorithm.iteration : map;
    import std.array : array;

    // Element qualifiers on the input must not prevent instantiation.
    const(dchar)[] constInput = "foo,bar\n";
    auto fromConst = csvReader(constInput).map!array.array;
    assert(fromConst == [["foo", "bar"]]);

    immutable(dchar)[] immutableInput = "foo,bar\n";
    auto fromImmutable = csvReader(immutableInput).map!array.array;
    assert(fromImmutable == [["foo", "bar"]]);
}

/*
 * This struct is stored on the heap for when the structures
 * are passed around.
 */
// Parsing state referenced through a pointer by CsvReader and CsvRecord.
private pure struct Input(Range, Malformed ErrorLevel)
{
    // The character range being read.
    Range range;
    // Position counters: CsvReader.prime increments `row` for each record it
    // starts, and CsvReader.popFront resets `col` to zero between records.
    size_t row, col;
    // Only present when exceptions are enabled: CsvReader.popFront stores the
    // value of `col` here the first time it runs while this is still zero.
    static if (ErrorLevel == Malformed.throwException)
        size_t rowLength;
}

/*
 * Range for iterating CSV records.
 *
 * This range is returned by the $(LREF csvReader) functions. It can be
 * created in a similar manner to allow $(D ErrorLevel) be set to $(LREF
 * Malformed).ignore if best guess processing should take place.
 */
private struct CsvReader(Contents, Malformed ErrorLevel, Range, Separator, Header)
if (isSomeChar!Separator && isInputRange!Range
    && is(Unqual!(ElementType!Range) == dchar)
    && isForwardRange!Header && isSomeString!(ElementType!Header))
{
private:
    // Parsing state, heap allocated by the constructors and handed to every
    // record range by pointer.
    Input!(Range, ErrorLevel)* _input;
    Separator _separator;
    Separator _quote;
    // Zero-based input column index for each requested header column;
    // empty when no columns were requested.
    size_t[] indices;
    bool _empty;
    // For struct, class and associative array contents the converted record
    // is kept in recordContent and recordRange yields the raw fields.
    static if (is(Contents == struct) || is(Contents == class))
    {
        Contents recordContent;
        CsvRecord!(string, ErrorLevel, Range, Separator) recordRange;
    }
    else static if (is(Contents T : T[U], U : string))
    {
        Contents recordContent;
        CsvRecord!(T, ErrorLevel, Range, Separator) recordRange;
    }
    else
        CsvRecord!(Contents, ErrorLevel, Range, Separator) recordRange;
public:
    /**
     * Header from the input in array form.
     *
     * -------
     * string str = "a,b,c\nHello,65,63.63";
     * auto records = csvReader(str, ["a"]);
     *
     * assert(records.header == ["a","b","c"]);
     * -------
     */
    string[] header;

    /**
     * Constructor to initialize the input, delimiter and quote for input
     * without a header.
     *
     * -------
     * string str = `76;^26^;22`;
     * int[] ans = [76,26,22];
     * auto records = CsvReader!(int,Malformed.ignore,string,char,string[])
     *       (str, ';', '^');
     *
     * foreach (record; records)
     * {
     *    assert(equal(record, ans));
     * }
     * -------
     */
    this(Range input, Separator delimiter, Separator quote)
    {
        _input = new Input!(Range, ErrorLevel)(input);
        _separator = delimiter;
        _quote = quote;

        prime();
    }

    /**
     * Constructor to initialize the input, delimiter and quote for input
     * with a header.
     *
     * -------
     * string str = "high;mean;low\n76;^26^;22";
     * auto records = CsvReader!(int,Malformed.ignore,string,char,string[])
     *       (str, ["high","low"], ';', '^');
     *
     * int[] ans = [76,22];
     * foreach (record; records)
     * {
     *    assert(equal(record, ans));
     * }
     * -------
     *
     * Throws:
     *       $(LREF HeaderMismatchException)  when a header is provided but a
     *       matching column is not found or the order did not match that found
     *       in the input (non-struct). In the out-of-order case $(D row) is
     *       one and $(D col) is the one-based input column of the first
     *       requested header that appears in the input before the header
     *       requested just ahead of it.
     */
    this(Range input, Header colHeaders, Separator delimiter, Separator quote)
    {
        _input = new Input!(Range, ErrorLevel)(input);
        _separator = delimiter;
        _quote = quote;

        // Every requested column starts out as "not found".
        size_t[string] colToIndex;
        foreach (h; colHeaders)
        {
            colToIndex[h] = size_t.max;
        }

        // indices is still empty here, so the header record is read in full.
        auto r = CsvRecord!(string, ErrorLevel, Range, Separator)
            (_input, _separator, _quote, indices);

        size_t colIndex;
        foreach (col; r)
        {
            header ~= col;
            auto ptr = col in colToIndex;
            if (ptr)
                *ptr = colIndex;
            colIndex++;
        }
        // The above loop empties the header row.
        recordRange._empty = true;

        indices.length = colToIndex.length;
        int i;
        foreach (h; colHeaders)
        {
            immutable index = colToIndex[h];
            static if (ErrorLevel != Malformed.ignore)
                if (index == size_t.max)
                    throw new HeaderMismatchException
                        ("Header not found: " ~ to!string(h));
            indices[i++] = index;
        }

        static if (!is(Contents == struct) && !is(Contents == class))
        {
            static if (is(Contents T : T[U], U : string))
            {
                import std.algorithm.sorting : sort;
                sort(indices);
            }
            else static if (ErrorLevel == Malformed.ignore)
            {
                import std.algorithm.sorting : sort;
                sort(indices);
            }
            else
            {
                import std.algorithm.searching : findAdjacent;
                import std.algorithm.sorting : isSorted;
                if (!isSorted(indices))
                {
                    auto ex = new HeaderMismatchException
                           ("Header in input does not match specified header.");
                    // isSorted failed, so an adjacent pair with a > b exists.
                    // The second element of the first such pair is the
                    // requested column found before its predecessor.
                    auto outOfOrder = findAdjacent!"a > b"(indices);
                    ex.row = 1;
                    // indices are zero-based; reported columns start at one.
                    ex.col = outOfOrder[1] + 1;

                    throw ex;
                }
            }
        }

        popFront();
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     *
     * Returns:
     *      If $(D Contents) is a struct, will be filled with record data.
     *
     *      If $(D Contents) is a class, will be filled with record data.
     *
     *      If $(D Contents) is a associative array, will be filled
     *      with record data.
     *
     *      If $(D Contents) is non-struct, a $(LREF CsvRecord) will be
     *      returned.
     */
    @property auto front()
    {
        assert(!empty);
        static if (is(Contents == struct) || is(Contents == class))
        {
            return recordContent;
        }
        else static if (is(Contents T : T[U], U : string))
        {
            return recordContent;
        }
        else
        {
            return recordRange;
        }
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     */
    @property bool empty() @safe @nogc pure nothrow const
    {
        return _empty;
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     *
     * Throws:
     *       $(LREF CSVException) When a quote is found in an unquoted field,
     *       data continues after a closing quote, the quoted field was not
     *       closed before data was empty, a conversion failed, or when the
     *       row's length does not match the previous length.
     */
    void popFront()
    {
        // Skip whatever the caller left unread in the current record.
        while (!recordRange.empty)
        {
            recordRange.popFront();
        }

        // Remember the column count the first time a record is finished.
        static if (ErrorLevel == Malformed.throwException)
            if (_input.rowLength == 0)
                _input.rowLength = _input.col;

        _input.col = 0;

        // Consume one record separator: CR, LF, or CR followed by LF.
        if (!_input.range.empty)
        {
            if (_input.range.front == '\r')
            {
                _input.range.popFront();
                if (!_input.range.empty && _input.range.front == '\n')
                    _input.range.popFront();
            }
            else if (_input.range.front == '\n')
                _input.range.popFront();
        }

        if (_input.range.empty)
        {
            _empty = true;
            return;
        }

        prime();
    }

    // Starts the next record and, for struct, class and associative array
    // contents, converts it into recordContent.
    private void prime()
    {
        if (_empty)
            return;
        _input.row++;
        static if (is(Contents == struct) || is(Contents == class))
        {
            // Every column is read; the fields are picked out below.
            recordRange = typeof(recordRange)
                                 (_input, _separator, _quote, null);
        }
        else
        {
            recordRange = typeof(recordRange)
                                 (_input, _separator, _quote, indices);
        }

        static if (is(Contents T : T[U], U : string))
        {
            T[U] aa;
            try
            {
                for (; !recordRange.empty; recordRange.popFront())
                {
                    // _input.col is used as the one-based column of the
                    // current field, hence the -1 when indexing header.
                    aa[header[_input.col-1]] = recordRange.front;
                }
            }
            catch (ConvException e)
            {
                throw new CSVException(e.msg, _input.row, _input.col, e);
            }

            recordContent = aa;
        }
        else static if (is(Contents == struct) || is(Contents == class))
        {
            static if (is(Contents == class))
                recordContent = new typeof(recordContent)();
            else
                recordContent = typeof(recordContent).init;
            size_t colIndex;
            try
            {
                for (; !recordRange.empty;)
                {
                    auto colData = recordRange.front;
                    scope(exit) colIndex++;
                    if (indices.length > 0)
                    {
                        // A header was given: field ti takes the input
                        // column recorded in indices[ti].
                        foreach (ti, ToType; Fields!(Contents))
                        {
                            if (indices[ti] == colIndex)
                            {
                                static if (!isSomeString!ToType) skipWS(colData);
                                recordContent.tupleof[ti] = to!ToType(colData);
                            }
                        }
                    }
                    else
                    {
                        // No header: fields are filled in declaration order.
                        foreach (ti, ToType; Fields!(Contents))
                        {
                            if (ti == colIndex)
                            {
                                static if (!isSomeString!ToType) skipWS(colData);
                                recordContent.tupleof[ti] = to!ToType(colData);
                            }
                        }
                    }
                    recordRange.popFront();
                }
            }
            catch (ConvException e)
            {
                throw new CSVException(e.msg, _input.row, colIndex, e);
            }
        }
    }
}

@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // Build the reader directly with ';' as delimiter and '^' as quote.
    alias Reader = CsvReader!(int,Malformed.ignore,string,char,string[]);
    string input = `76;^26^;22`;
    int[] expected = [76,26,22];

    foreach (row; Reader(input, ';', '^'))
        assert(equal(row, expected));
}

// Bugzilla 15545
// @system due to the catch for Throwable
@system pure unittest
{
    import std.exception : assertNotThrown;

    // Input whose last record ends in a lone carriage return.
    enum failData =
    "name, surname, age
    Joe, Joker, 99\r";

    auto reader = csvReader(failData);
    // Draining the whole reader must complete without throwing.
    assertNotThrown((){foreach (row; reader){}}());
}

/*
 * This input range is accessible through $(LREF CsvReader) when the
 * requested $(D Contents) type is neither a structure nor an associative array.
 */
private struct CsvRecord(Contents, Malformed ErrorLevel, Range, Separator)
if (!is(Contents == class) && !is(Contents == struct))
{
    import std.array : appender;
private:
    Input!(Range, ErrorLevel)* _input;
    Separator _separator;
    Separator _quote;
    Contents curContentsoken;
    typeof(appender!(dchar[])()) _front;
    bool _empty;
    size_t[] _popCount;
public:
    /*
     * Params:
     *      input = Pointer to a character input range
     *      delimiter = Separator for each column
     *      quote = Character used for quotation
     *      indices = An array containing which columns will be returned.
     *             If empty, all columns are returned. List must be in order.
     */
    this(Input!(Range, ErrorLevel)* input, Separator delimiter,
         Separator quote, size_t[] indices)
    {
        _input = input;
        _separator = delimiter;
        _quote = quote;
        _front = appender!(dchar[])();
        _popCount = indices.dup;

        // If a header was given, each call to popFront will need
        // to eliminate so many tokens. This calculates
        // how many will be skipped to get to the next header column
        size_t normalizer;
        foreach (ref c; _popCount)
        {
            static if (ErrorLevel == Malformed.ignore)
            {
                // If we are not throwing exceptions
                // a header may not exist, indices are sorted
                // and will be size_t.max if not found.
                if (c == size_t.max)
                    break;
            }
            c -= normalizer;
            normalizer += c + 1;
        }

        prime();
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     */
    @property Contents front() @safe pure
    {
        assert(!empty);
        return curContentsoken;
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     */
    @property bool empty() @safe pure nothrow @nogc const
    {
        return _empty;
    }

    /*
     * CsvRecord is complete when input
     * is empty or starts with record break
     */
    private bool recordEnd()
    {
        if (_input.range.empty
           || _input.range.front == '\n'
           || _input.range.front == '\r')
        {
            return true;
        }
        return false;
    }


    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     *
     * Throws:
     *       $(LREF CSVException) When a quote is found in an unquoted field,
     *       data continues after a closing quote, the quoted field was not
     *       closed before data was empty, a conversion failed, or when the
     *       row's length does not match the previous length.
     */
    void popFront()
    {
        static if (ErrorLevel == Malformed.throwException)
            import std.format : format;
        // Skip last of record when header is depleted.
        if (_popCount.ptr && _popCount.empty)
            while (!recordEnd())
            {
                prime(1);
            }

        if (recordEnd())
        {
            _empty = true;
            static if (ErrorLevel == Malformed.throwException)
                if (_input.rowLength != 0)
                    if (_input.col != _input.rowLength)
                        throw new CSVException(
                           format("Row %s's length %s does not match "~
                                  "previous length of %s.", _input.row,
                                  _input.col, _input.rowLength));
            return;
        }
        else
        {
            static if (ErrorLevel == Malformed.throwException)
                if (_input.rowLength != 0)
                    if (_input.col > _input.rowLength)
                        throw new CSVException(
                           format("Row %s's length %s does not match "~
                                  "previous length of %s.", _input.row,
                                  _input.col, _input.rowLength));
        }

        // Separator is left on the end of input from the last call.
        // This cannot be moved to after the call to csvNextToken as
        // there may be an empty record after it.
        if (_input.range.front == _separator)
            _input.range.popFront();

        _front.shrinkTo(0);

        prime();
    }

    /*
     * Handles moving to the next skipNum token.
     */
    private void prime(size_t skipNum)
    {
        foreach (i; 0 .. skipNum)
        {
            _input.col++;
            _front.shrinkTo(0);
            if (_input.range.front == _separator)
                _input.range.popFront();

            try
                csvNextToken!(Range, ErrorLevel, Separator)
                                   (_input.range, _front, _separator, _quote,false);
            catch (IncompleteCellException ice)
            {
                ice.row = _input.row;
                ice.col = _input.col;
                ice.partialData = _front.data.idup;
                throw ice;
            }
            catch (ConvException e)
            {
                throw new CSVException(e.msg, _input.row, _input.col, e);
            }
        }
    }

    private void prime()
    {
        try
        {
            _input.col++;
            csvNextToken!(Range, ErrorLevel, Separator)
                (_input.range, _front, _separator, _quote,false);
        }
        catch (IncompleteCellException ice)
        {
            ice.row = _input.row;
            ice.col = _input.col;
            ice.partialData = _front.data.idup;
            throw ice;
        }

        auto skipNum = _popCount.empty ? 0 : _popCount.front;
        if (!_popCount.empty)
            _popCount.popFront();

        if (skipNum == size_t.max)
        {
            while (!recordEnd())
                prime(1);
            _empty = true;
            return;
        }

        if (skipNum)
            prime(skipNum);

        auto data = _front.data;
        static if (!isSomeString!Contents) skipWS(data);
        try curContentsoken = to!Contents(data);
        catch (ConvException e)
        {
            throw new CSVException(e.msg, _input.row, _input.col, e);
        }
    }
}

/**
 * Lower level control over parsing CSV
 *
 * This function consumes the input. After each call the input will
 * start with either a delimiter or record break (\n, \r\n, \r) which
 * must be removed for subsequent calls.
 *
 * Params:
 *       input = Any CSV input
 *       ans   = The first field in the input
 *       sep   = The character to represent a comma in the specification
 *       quote = The character to represent a quote in the specification
 *       startQuoted = Whether the input should be considered to already be in
 * quotes. When set, a leading record break is treated as field content and a
 * leading quote character as a closing (or escaping) quote.
 *
 * Throws:
 *       $(LREF IncompleteCellException) When a quote is found in an unquoted
 *       field, data continues after a closing quote, or the quoted field was
 *       not closed before data was empty.
 */
void csvNextToken(Range, Malformed ErrorLevel = Malformed.throwException,
                           Separator, Output)
                          (ref Range input, ref Output ans,
                           Separator sep, Separator quote,
                           bool startQuoted = false)
if (isSomeChar!Separator && isInputRange!Range
    && is(Unqual!(ElementType!Range) == dchar)
    && isOutputRange!(Output, dchar))
{
    bool quoted = startQuoted;
    bool escQuote;
    if (input.empty)
        return;

    // The checks below only make sense at the start of a fresh field.
    // When the caller states that a quoted field is already open, a
    // record break belongs to the field and a quote character closes
    // (or escapes within) it; both are handled by the main loop.
    if (!quoted)
    {
        // An empty field at the end of a record: leave the break in place.
        if (input.front == '\n')
            return;
        if (input.front == '\r')
            return;

        // Opening quote of a quoted field.
        if (input.front == quote)
        {
            quoted = true;
            input.popFront();
        }
    }

    while (!input.empty)
    {
        assert(!(quoted && escQuote));
        if (!quoted)
        {
            // When not quoted the token ends at sep
            if (input.front == sep)
                break;
            if (input.front == '\r')
                break;
            if (input.front == '\n')
                break;
        }
        if (!quoted && !escQuote)
        {
            if (input.front == quote)
            {
                // Not quoted, but quote found
                static if (ErrorLevel == Malformed.throwException)
                    throw new IncompleteCellException(
                          "Quote located in unquoted token");
                else static if (ErrorLevel == Malformed.ignore)
                    ans.put(quote);
            }
            else
            {
                // Not quoted, non-quote character
                ans.put(input.front);
            }
        }
        else
        {
            if (input.front == quote)
            {
                // Quoted, quote found
                // By turning off quoted and turning on escQuote
                // I can tell when to add a quote to the string
                // escQuote is turned to false when it escapes a
                // quote or is followed by a non-quote (see outside else).
                // They are mutually exclusive, but provide different
                // information.
                if (escQuote)
                {
                    escQuote = false;
                    quoted = true;
                    ans.put(quote);
                } else
                {
                    escQuote = true;
                    quoted = false;
                }
            }
            else
            {
                // Quoted, non-quote character
                if (escQuote)
                {
                    static if (ErrorLevel == Malformed.throwException)
                        throw new IncompleteCellException(
                          "Content continues after end quote, " ~
                          "or needs to be escaped.");
                    else static if (ErrorLevel == Malformed.ignore)
                        break;
                }
                ans.put(input.front);
            }
        }
        input.popFront();
    }

    // Still inside quotes when the loop ended: the field was never closed.
    static if (ErrorLevel == Malformed.throwException)
        if (quoted && (input.empty || input.front == '\n' || input.front == '\r'))
            throw new IncompleteCellException(
                  "Data continues on future lines or trailing quote");

}

///
@safe unittest
{
    import std.array : appender;
    import std.range.primitives : popFront;

    string input = "65,63\n123,3673";

    auto token = appender!(char[])();

    // The first field is read; the separator stays at the head of the input.
    csvNextToken(input, token, ',', '"');
    assert(token.data == "65");
    assert(input == ",63\n123,3673");

    // Drop the separator and clear the sink before reading the next field.
    token.shrinkTo(0);
    input.popFront();
    csvNextToken(input, token, ',', '"');
    assert(token.data == "63");
    assert(input == "\n123,3673");

    // A record break has to be removed by the caller in the same way.
    token.shrinkTo(0);
    input.popFront();
    csvNextToken(input, token, ',', '"');
    assert(token.data == "123");
    assert(input == ",3673");
}

// Test csvNextToken on simplest form and correct format.
@safe pure unittest
{
    import std.array;

    // Expected token and the input remaining after it has been read.
    static struct Step
    {
        dstring token;
        string rest;
    }

    Step[] steps = [
        Step("\U00010143Hello"d, ",65,63.63\nWorld,123,3673.562"),
        Step("65"d, ",63.63\nWorld,123,3673.562"),
        Step("63.63"d, "\nWorld,123,3673.562"),
        Step("World"d, ",123,3673.562"),
        Step("123"d, ",3673.562"),
        Step("3673.562"d, ""),
    ];

    string str = "\U00010143Hello,65,63.63\nWorld,123,3673.562";
    auto a = appender!(dchar[])();

    foreach (idx, step; steps)
    {
        // Between tokens the delimiter or record break is removed and
        // the output buffer is reset.
        if (idx != 0)
        {
            str.popFront();
            a.shrinkTo(0);
        }
        csvNextToken(str,a,',','"');
        assert(a.data == step.token);
        assert(str == step.rest);
    }
}

// Test quoted tokens
@safe pure unittest
{
    import std.array;

    // Expected token and the input remaining after it has been read.
    static struct Step
    {
        dstring token;
        string rest;
    }

    Step[] steps = [
        Step("one"d, `,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix"),
        Step("two"d, `,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix"),
        Step("three \"quoted\""d, `,"",` ~ "\"five\nnew line\"\nsix"),
        Step(""d, ",\"five\nnew line\"\nsix"),
        Step("five\nnew line"d, "\nsix"),
        Step("six"d, ""),
    ];

    string str = `one,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix";
    auto a = appender!(dchar[])();

    foreach (idx, step; steps)
    {
        // Remove the delimiter or record break left by the previous call
        // and start with an empty buffer.
        if (idx != 0)
        {
            str.popFront();
            a.shrinkTo(0);
        }
        csvNextToken(str,a,',','"');
        assert(a.data == step.token);
        assert(str == step.rest);
    }
}

// Test empty data is pulled at end of record.
@safe pure unittest
{
    import std.array;

    string input = "one,";
    auto token = appender!(dchar[])();

    csvNextToken(input, token, ',', '"');
    assert(token.data == "one");
    assert(input == ",");

    // The separator is deliberately left in place: with it at the head of
    // the input the next token comes back empty.
    token.shrinkTo(0);
    csvNextToken(input, token, ',', '"');
    assert(token.data == "");
}

// Test exceptions
@safe pure unittest
{
    import std.array;

    auto sink = appender!(dchar[])();
    bool caught;

    // A quoted field that is never closed: everything is consumed and the
    // partial data stays available in the output.
    string str = "\"one\nnew line";
    caught = false;
    try
        csvNextToken(str, sink, ',', '"');
    catch (IncompleteCellException)
        caught = true;
    assert(caught);
    assert(sink.data == "one\nnew line");
    assert(str == "");

    // A quote inside an unquoted field: parsing stops at the quote.
    str = "Hello world\"";
    sink = appender!(dchar[])();
    caught = false;
    try
        csvNextToken(str, sink, ',', '"');
    catch (IncompleteCellException)
        caught = true;
    assert(caught);
    assert(sink.data == "Hello world");
    assert(str == "\"");

    // With Malformed.ignore the stray quotes are copied through instead.
    str = "one, two \"quoted\" end";

    sink = appender!(dchar[])();
    csvNextToken!(string,Malformed.ignore)(str, sink, ',', '"');
    assert(sink.data == "one");
    sink.shrinkTo(0);
    str.popFront();
    csvNextToken!(string,Malformed.ignore)(str, sink, ',', '"');
    assert(sink.data == " two \"quoted\" end");
}

// Test modifying token delimiter
@safe pure unittest
{
    import std.array;

    // '|' separates the fields and '/' is the quote character, so a plain
    // double quote is ordinary content here.
    string input = `one|two|/three "quoted"/|//`;
    auto token = appender!(dchar[])();

    csvNextToken(input, token, '|', '/');
    assert(token.data == "one"d);
    assert(input == `|two|/three "quoted"/|//`);

    token.shrinkTo(0);
    input.popFront();
    csvNextToken(input, token, '|', '/');
    assert(token.data == "two"d);
    assert(input == `|/three "quoted"/|//`);

    token.shrinkTo(0);
    input.popFront();
    csvNextToken(input, token, '|', '/');
    assert(token.data == `three "quoted"`);
    assert(input == `|//`);

    // An empty quoted field.
    token.shrinkTo(0);
    input.popFront();
    csvNextToken(input, token, '|', '/');
    assert(token.data == ""d);
}

// Bugzilla 8908
// Whitespace around numeric fields must not break the conversion, both
// when reading into a struct and when reading single values.
@safe pure unittest
{
    string csv = `  1.0, 2.0, 3.0
                    4.0, 5.0, 6.0`;

    static struct Data { real a, b, c; }
    size_t i = 0;
    foreach (data; csvReader!Data(csv)) with (data)
    {
        int[] row = [cast(int) a, cast(int) b, cast(int) c];
        if (i == 0)
            assert(row == [1, 2, 3]);
        else
            assert(row == [4, 5, 6]);
        ++i;
    }
    // Both records must have been seen; otherwise the assertions inside
    // the loop would never have run.
    assert(i == 2);

    i = 0;
    foreach (data; csvReader!real(csv))
    {
        auto a = data.front;    data.popFront();
        auto b = data.front;    data.popFront();
        auto c = data.front;
        int[] row = [cast(int) a, cast(int) b, cast(int) c];
        if (i == 0)
            assert(row == [1, 2, 3]);
        else
            assert(row == [4, 5, 6]);
        ++i;
    }
    assert(i == 2);
}
author	Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date	Mon, 25 May 2020 18:13:55 +0900
parents	1830386684a0
children