File: do_file.c

package info (click to toggle)
swish++ 6.1.5-2
  • links: PTS
  • area: main
  • in suites: squeeze
  • size: 2,256 kB
  • ctags: 1,759
  • sloc: ansic: 11,931; lisp: 804; sh: 629; perl: 366; makefile: 80
file content (266 lines) | stat: -rw-r--r-- 8,302 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
/*
**      SWISH++
**      do_file.c
**
**      Copyright (C) 1998  Paul J. Lucas
**
**      This program is free software; you can redistribute it and/or modify
**      it under the terms of the GNU General Public License as published by
**      the Free Software Foundation; either version 2 of the License, or
**      (at your option) any later version.
**
**      This program is distributed in the hope that it will be useful,
**      but WITHOUT ANY WARRANTY; without even the implied warranty of
**      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**      GNU General Public License for more details.
**
**      You should have received a copy of the GNU General Public License
**      along with this program; if not, write to the Free Software
**      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

/*
**      Note that this file is #include'd into index.c and extract.c because
**      it generates different code depending on which one it's compiled into.
*/

// standard
#include <vector>

// local
#include "encoded_char.h"

//*****************************************************************************
//
// SYNOPSIS
//
#ifdef  INDEX
        void do_file( char const *file_name, int dir_index )
#else
        void do_file( char const *file_name )
#endif
//
// DESCRIPTION
//
//      Either index or extract text from the given file, but only if its
//      extension is among (not among) the specified set.  It will not follow
//      symbolic links unless either the FollowLinks config. file variable or
//      the -l command-line option was given.
//
//      For extraction, the algorithm is loosely based on what the Unix
//      strings(1) command does except it goes a bit further to discard things
//      like Encapsulated PostScript and raw hex data.
//
//      The checks are performed in this order; the first one that fails
//      causes the file to be skipped (with a reason printed if verbosity > 3):
//
//          1. The file must be a plain file.
//          2. It must not be a symbolic link (unless links are followed).
//          3. (Indexing only) It must not have been seen before when
//             indexing incrementally.
//          4. Its (filter-substituted) base name must not match an exclude
//             pattern and, if any include patterns exist, must match one.
//          5. (Extraction only, not as a filter) The output file name must
//             fit in PATH_MAX and the output file must not already exist.
//          6. All filters must execute successfully.
//          7. The (possibly filtered) file must be openable.
//          8. (Extraction only, not as a filter) The output file must be
//             creatable.
//
//      The output file for extraction is deliberately created only after the
//      filter(s) have run and the input has been opened.  Creating it any
//      earlier would leave an empty output file behind whenever a filter
//      fails or the input can not be opened, and that stale file would make
//      every later run skip the input as "already exists".
//
// PARAMETERS
//
//      file_name   The file to process.
//
//      dir_index   (Indexing only.)  Passed unchanged to the file_info
//                  constructor for the file being indexed.
//
//*****************************************************************************
{
    char const *const orig_base_name = pjl_basename( file_name );

    ++num_examined_files;
    if ( verbosity > 3 )                        // print base name of file
        cout << "  " << orig_base_name << flush;

    ////////// Simple checks to see if we should process the file /////////////

    if ( !is_plain_file() ) {
        //
        // We're able to use the zero-argument form of is_plain_file() because
        // the stat_buf is cached by the call to file_exists() in both index.c
        // and extract.c just before the call to do_file().
        //
        if ( verbosity > 3 )
            cout << " (skipped: not plain file)\n";
        return;
    }

#ifdef  INDEX
    //
    // Record the size of the original (non-filtered) file here before we call
    // is_symbolic_link() below.  This is the size that is stored in the index.
    //
    off_t const orig_file_size = file_size();
#endif  /* INDEX */

#ifndef PJL_NO_SYMBOLIC_LINKS
    if ( is_symbolic_link( file_name ) && !follow_symbolic_links ) {
        //
        // Despite the above comment for is_plain_file(), we have to use the
        // one-argument form of is_symbolic_link() because we need to call
        // lstat(2) rather than stat(2).
        //
        if ( verbosity > 3 )
            cout << " (skipped: symbolic link)\n";
        return;
    }
#endif  /* PJL_NO_SYMBOLIC_LINKS */

#ifdef  INDEX
    //
    // If incrementally indexing, it's possible that we've encountered the file
    // before.
    //
    if ( incremental && file_info::seen_file( file_name ) ) {
        if ( verbosity > 3 )
            cout << " (skipped: encountered before)\n";
        return;
    }
#endif  /* INDEX */

    ////////// Perform filter name substitution(s) ////////////////////////////

    typedef vector< filter > filter_list_type;
    filter_list_type filter_list;
#ifdef  INDEX
    //
    // Remember the name we were called with: it, not the filtered name, is
    // what gets recorded in the file_info below.
    //
    char const *const orig_file_name = file_name;
#endif

    while ( true ) {
        //
        // Determine if the file needs to be filtered and, if so, set the
        // filename to what it would become if it were filtered.  This repeats
        // until no filter matches the (substituted) name, so filters chain.
        //
        // NOTE(review): after the first iteration file_name is whatever the
        // previous element's substitute() returned; if that points into the
        // element itself, push_back() may invalidate it by reallocating the
        // vector -- confirm against the filter class.
        //
        FilterFile::const_pointer const f = file_filters[ file_name ];
        if ( !f )
            break;
        filter_list.push_back( *f );
        file_name = filter_list.back().substitute( file_name );
    }
    char const *const base_name = pjl_basename( file_name );

    //
    // Skip the file if it matches one of the set of unacceptable patterns.
    //
    if ( exclude_patterns.matches( base_name ) ) {
        if ( verbosity > 3 )
            cout << " (skipped: file excluded)\n";
        return;
    }

    //
    // See if the filename pattern is included.
    //
#ifdef  INDEX
    IncludeFile::const_iterator const
#else
    ExtractFile::const_iterator const
#endif
        include_pattern = include_patterns.find( base_name );
    //
    // Skip the file if the set of acceptable patterns doesn't contain the
    // candidate, but only if there was at least one acceptable pattern
    // specified.
    //
    bool const found_pattern = include_pattern != include_patterns.end();
    if ( !include_patterns.empty() && !found_pattern ) {
        if ( verbosity > 3 )
            cout << " (skipped: file not included)\n";
        return;
    }

#ifdef  EXTRACT
    //
    // If we're not running as a filter, work out the name of the extracted
    // file now and skip extraction entirely if it already exists.  The file
    // itself is NOT created here; see below.
    //
    char extracted_file_name[ PATH_MAX + 1 ];
    extracted_file_name[0] = '\0';              // unused when run as a filter
    if ( !extract_as_filter ) {
        if ( ::strlen( file_name ) + extract_extension.length() > PATH_MAX ) {
            if ( verbosity > 3 )
                cout << " (skipped: " << extract_extension
                     << " file-name too long)\n";
            return;
        }
        ::strcpy( extracted_file_name, file_name );
        ::strcat( extracted_file_name, extract_extension );
        if ( file_exists( extracted_file_name ) ) {
            if ( verbosity > 3 )
                cout << " (skipped: " << extract_extension
                     << " file already exists)\n";
            return;
        }
    }
#endif  /* EXTRACT */

    //
    // Execute the filter(s) on the file.  Each exec() yields the name of the
    // file to use from then on, or null on failure.
    //
    FOR_EACH( filter_list_type, filter_list, f )
        if ( !( file_name = f->exec() ) ) {
            if ( verbosity > 3 )
                cout << " (skipped: could not filter)\n";
            return;
        }

    //
    // We can (finally!) open the (possibly post-filtered) file.
    //
    mmap_file const file( file_name );
    if ( !file ) {
        if ( verbosity > 3 )
            cout << " (skipped: can not open)\n";
        return;
    }
    file.behavior( mmap_file::bt_sequential );

#ifdef  EXTRACT
    //
    // Only now that we know we have input do we create the output file.  Had
    // it been created before the filter and open steps above, a failure in
    // either would have left an empty file that blocks all future attempts.
    //
    ostream *out;
    ofstream extracted_file;
    if ( extract_as_filter ) {
        //
        // We're running as a filter: write to standard output.
        //
        out = &cout;
    } else {
        extracted_file.open( extracted_file_name );
        if ( !extracted_file ) {
            if ( verbosity > 3 )
                cout << " (skipped: can not create "
                     << extract_extension << " file)\n";
            return;
        }
        out = &extracted_file;
    }
#endif  /* EXTRACT */

    //
    // At verbosity 3, only files that are actually processed are listed, so
    // the name is printed here rather than at the top.
    //
    if ( verbosity == 3 )                       // print base name of file
        cout << "  " << orig_base_name << flush;

#ifdef  INDEX
    if ( file.empty() ) {
        //
        // Don't waste a file_info entry on it.
        //
        if ( verbosity > 2 )
            cout << " (0 words)\n";
        return;
    }

    ////////// Index the file /////////////////////////////////////////////////

#ifdef  IMPLEMENT_DECODING
    encoded_char_range::decoder::reset_all();
#endif
    //
    // Use the indexer associated with the matching include pattern, if any;
    // otherwise fall back to the plain text indexer.
    //
    indexer *const i = found_pattern ?
        include_pattern->second : indexer::text_indexer();
    //
    // NOTE(review): fi is never deleted in this function; presumably the
    // file_info constructor registers the object somewhere that owns it --
    // confirm against file_info.
    //
    file_info *const fi = new file_info(
        orig_file_name, dir_index, orig_file_size, i->find_title( file )
    );
#ifdef  FEATURE_word_pos
    word_pos = 0;
#endif
    i->index_file( file );

    if ( verbosity > 2 )
        cout << " (" << fi->num_words() << " words)\n";

    //
    // Flush to a partial index once enough words have accumulated.
    //
    if ( words.size() >= word_threshold )
        write_partial_index();
#endif  /* INDEX */

#ifdef  EXTRACT
    ////////// Extract the file ///////////////////////////////////////////////

    ++num_extracted_files;
    extract_words( file.begin(), file.end(), *out );
#endif  /* EXTRACT */
}
/* vim:set et sw=4 ts=4: */