File: marc_importer.rb

package info (click to toggle)
lucene-solr 3.6.2%2Bdfsg-27
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 91,144 kB
  • sloc: java: 465,555; xml: 24,939; javascript: 5,291; ruby: 3,453; jsp: 2,637; python: 1,619; sh: 1,556; perl: 1,407; cpp: 305; makefile: 41
file content (106 lines) | stat: -rwxr-xr-x 2,995 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env ruby
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'marc'
require 'solr'

# Runtime configuration, taken from the environment and the command line:
#   SOLR_URL - base URL of the Solr instance (defaults to a local Solr)
#   ARGV[0]  - path of the MARC file to import (a name ending in .gz is unzipped first)
#   ARGV[1]  - pass "-debug" to print each document instead of sending it to Solr
solr_url = ENV["SOLR_URL"] || "http://localhost:8983/solr"
marc_filename = ARGV[0]
# Fail with a usage message rather than a NoMethodError on nil when no file is given.
abort "Usage: #{$0} <marc_file[.gz]> [-debug]" if marc_filename.nil?
# Every two-digit run found in the path (an Array of Strings); used as the
# filename facet value and to name the temporary unzipped copy.
file_number = marc_filename.scan(/\d\d/)
debug = ARGV[1] == "-debug"

# $KCODE only has an effect on Ruby 1.8; later interpreters ignore it and warn.
$KCODE = 'UTF8' if RUBY_VERSION < '1.9'

# Maps each Solr field name to the MARC data that populates it.  A value is one of:
#   String - a single field spec
#   Array  - several field specs whose values are merged into one list
#   Proc   - called with the MARC record; whatever it returns is indexed
# A field spec is a 3 digit control field number, or a 3 digit data field
# number followed by a subfield letter.
mapping = {
  :id => '001',
  :subject_genre_facet => ['600v', '610v', '611v', '650v', '651v', '655a'],
  :subject_era_facet => ['650d', '650y', '651y', '655y'],
  :subject_topic_facet => ['650a', '650b', '650x'],
  :subject_geographic_facet => ['650c', '650z', '651a', '651x', '651z', '655z'],
  # Pull every 4 digit run out of the 260c values
  :year_facet => Proc.new do |r|
    extract_record_data(r,'260c').collect {|f| f.scan(/\d\d\d\d/)}.flatten
  end,
  :title_text => '245a',
  :author_text => '100a',
  :call_number_text => '050a',
  # MARC tag 020 carries the ISBN; tag 010 (used here previously) is the LCCN
  :isbn_text => '020a',
  :filename_facet => Proc.new {|r| file_number},
}

# Solr client pointed at solr_url; used below to add documents and commit.
connection = Solr::Connection.new(solr_url)

# A gzip-compressed input is copied to /tmp and unpacked there, leaving the
# original file untouched; the import then reads the unpacked copy.
if marc_filename =~ /\.gz\z/
  puts "Unzipping data file..."
  # file_number is an Array of digit pairs; join it explicitly because
  # interpolating an Array yields its inspect form on Ruby 1.9+.
  temp_filename = "/tmp/marc_data_#{Array(file_number).join}.mrc"
  # The argument-list form of system bypasses the shell, so paths containing
  # spaces or shell metacharacters are passed through verbatim.
  unless system("cp", marc_filename, "#{temp_filename}.gz")
    abort "Could not copy #{marc_filename} to #{temp_filename}.gz"
  end
  # -f overwrites an unpacked copy left behind by an earlier run instead of
  # silently keeping the stale file.
  unless system("gunzip", "-f", "#{temp_filename}.gz")
    abort "Could not unzip #{temp_filename}.gz"
  end
  marc_filename = temp_filename
end

reader = MARC::Reader.new(marc_filename)
# Number of records processed so far
count = 0

# Collects the values of the requested MARC fields from a record.
#
# record - an enumerable of MARC fields; each field responds to +tag+, and to
#          +value+ (control fields) or +[]+ with a subfield code (data fields)
# fields - a field spec String, or an Array of them.  A spec is a 3 digit
#          tag; for data fields (tag 010 and above) it is followed by a
#          one-letter subfield code, e.g. '245a'
#
# Returns an Array of the extracted values, in the order they were found,
# with nils and duplicates removed.
def extract_record_data(record, fields)
  extracted_data = []

  # Accept a lone spec as well as a list: String#each does not exist on Ruby 1.9+.
  Array(fields).each do |field|
    tag = field[0, 3]
    control_field = tag < '010'
    # field[3, 1] is the subfield letter as a String on every Ruby version
    # (field[3] is an Integer on 1.8 and nil when the spec has no letter).
    subfield = field[3, 1]

    record.each do |field_instance|
      next unless field_instance.tag === tag

      value = begin
        control_field ? field_instance.value : field_instance[subfield]
      rescue StandardError
        nil # best effort: a field that cannot supply the value is skipped
      end
      extracted_data << value
    end
  end

  extracted_data.compact.uniq
end

# Build one Solr document per MARC record from the field mapping.  In debug
# mode each document is printed; otherwise it is sent to Solr and a commit is
# issued once every record has been read.
puts "Indexing #{marc_filename}..."
reader.each do |record|
  doc = {}
  mapping.each do |solr_field, source|
    values =
      if source.is_a?(Proc)
        # Procs compute their own value from the record
        source.call(record)
      elsif source.is_a?(String) || source.is_a?(Array)
        found = extract_record_data(record, source)
        found.empty? ? nil : found
      end

    # Leave the field out of the document when nothing was produced
    doc[solr_field] = values if values
  end

  if debug
    puts doc.inspect, "------"
  else
    connection.send(Solr::Request::AddDocument.new(doc))
  end

  count += 1

  # Progress indicator every 100 records
  puts count if (count % 100).zero?
end

connection.send(Solr::Request::Commit.new) unless debug
puts "Done"