File: bylinefast.d

package info (click to toggle)
sambamba 1.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,528 kB
  • sloc: sh: 220; python: 166; ruby: 147; makefile: 103
file content (142 lines) | stat: -rw-r--r-- 4,851 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/// Kudos to Juan Manuel Cabo
/// http://forum.dlang.org/post/cddkatcqmdtibcmfljff@forum.dlang.org
///
/// This piece of code is in public domain.
module bio.core.utils.bylinefast;

import std.stdio;
import std.string: indexOf;
import core.stdc.string: memmove;

/**
   Input range that reads a File line by line in an efficient way (the original
   author reports it as about 10 times faster than File.byLine from std.stdio).
   This is accomplished by reading entire buffers with rawRead (fgetc() is only
   used once, to detect an empty file), and allocating as little as possible.

   The char \n is considered as separator, removing the previous \r if it exists.

   The \n is never returned. The \r is not returned if it was
   part of a \r\n (but it is returned if it was by itself).
   A trailing \r at the very end of a file that lacks a final \n is
   stripped as well.

   The returned char[] is always a slice of an internal temporary
   buffer that is overwritten by the next popFront(), so it must not
   be stored. If necessary, use .dup or .idup to copy it. Note that
   slicing (line[]) does NOT copy; it refers to the same memory.

   Example:

   File f = File("file.txt");
   foreach (char[] line; ByLineFast(f)) {
       ...process line...
       //Make a copy:
       string copy = line.idup;
   }

   The file isn't closed when done iterating, unless it was the only reference to
   the file (same as std.stdio.byLine). (example: ByLineFast(File("file.txt"))).
*/
struct ByLineFast {
    /// File being read; detached from this range once the end is reached.
    File file;
    /// Current line (slice of `buffer`); null once the input is exhausted.
    char[] line;
    /// True until the first line has been loaded by front() or popFront().
    bool first_call = true;
    /// Backing storage for reads; doubled when a line does not fit.
    char[] buffer;
    /// Part of `buffer` that has been read but not yet returned as lines.
    char[] strBuffer;

    /**
       Params:
         f          = file to read lines from
         bufferSize = initial size of the read buffer in bytes; must be
                      positive. The buffer grows automatically for lines
                      longer than this.
    */
    this(File f, int bufferSize=4096) {
        assert(bufferSize > 0);
        file = f;
        buffer.length = bufferSize;
    }

    /// Returns true when there are no more lines to read.
    @property bool empty() const {
        //It's important to check "line !is null" instead of
        //"line.length != 0", otherwise, no empty lines can
        //be returned, the iteration would be closed.
        if (line !is null) {
            return false;
        }
        if (!file.isOpen) {
            //Clean the buffer to avoid pointer false positives:
            (cast(char[])buffer)[] = 0;
            return true;
        }

        //First read. Determine if it's empty and put the char back.
        //The const is cast away because getFP() is not a const method.
        auto mutableFP = (cast(File*) &file).getFP();
        auto c = fgetc(mutableFP);
        if (c == -1) {
            //fgetc reported EOF (or an error): nothing to iterate.
            //Clean the buffer to avoid pointer false positives:
            (cast(char[])buffer)[] = 0;
            return true;
        }
        if (ungetc(c, mutableFP) != c) {
            assert(false, "Bug in cstdlib implementation");
        }
        return false;
    }

    /// Returns the current line, loading the first one on demand.
    @property char[] front() {
        if (first_call) {
            first_call = false;
            fetchLine();
        }
        return line;
    }

    /// Advances to the next line; `line` becomes null at the end of the input.
    void popFront() {
        if (first_call) {
            //front() was never called: load the first line now so that
            //this call really skips it and empty() stays accurate.
            first_call = false;
            fetchLine();
            if (line is null) {
                //The file had no content; there is nothing to skip.
                return;
            }
        }
        fetchLine();
    }

    /// Reads the next line into `line`, refilling and growing `buffer` as
    /// needed. Sets `line` to null and detaches the file when nothing is left.
    private void fetchLine() {
        if (strBuffer.length == 0) {
            strBuffer = file.rawRead(buffer);
            if (strBuffer.length == 0) {
                file.detach();
                line = null;
                return;
            }
        }

        //Bytes before this offset are already known to contain no '\n',
        //so each pass only scans the data that was just read.
        size_t searchFrom = 0;
        while (true) {
            auto rel = strBuffer[searchFrom .. $].indexOf('\n');
            if (rel != -1) {
                immutable size_t nl = searchFrom + cast(size_t) rel;
                //Drop the '\r' of a "\r\n" terminator.
                immutable size_t lineEnd =
                    (nl != 0 && strBuffer[nl - 1] == '\r') ? nl - 1 : nl;
                line = strBuffer[0 .. lineEnd];
                //Pop the line, skipping the terminator:
                strBuffer = strBuffer[nl + 1 .. $];
                return;
            }

            //More needs to be read here. Copy the tail of the buffer
            //to the beginning, and try to read with the empty part of
            //the buffer.
            //If no buffer was left, extend the size of the buffer before
            //reading. If the file has ended, then the line is the entire
            //buffer.
            if (strBuffer.ptr != buffer.ptr) {
                //Must use memmove because there might be overlap
                memmove(buffer.ptr, strBuffer.ptr,
                        strBuffer.length * char.sizeof);
            }
            immutable size_t spaceBegin = strBuffer.length;
            if (spaceBegin == buffer.length) {
                //Must extend the buffer to keep reading.
                assumeSafeAppend(buffer);
                buffer.length = buffer.length * 2;
            }
            char[] readPart = file.rawRead(buffer[spaceBegin .. $]);
            if (readPart.length == 0) {
                //End of the file. Return what's in the buffer.
                //The next popFront() will try to read again, and then
                //mark empty condition.
                if (spaceBegin != 0 && buffer[spaceBegin - 1] == '\r') {
                    line = buffer[0 .. spaceBegin - 1];
                } else {
                    line = buffer[0 .. spaceBegin];
                }
                strBuffer = null;
                return;
            }
            strBuffer = buffer[0 .. spaceBegin + readPart.length];
            //Only the freshly read bytes can contain the terminator.
            searchFrom = spaceBegin;
        }
    }
}