File: medline.rb

package info (click to toggle)
ruby-bio 1.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,480 kB
  • ctags: 9,428
  • sloc: ruby: 74,117; xml: 3,383; makefile: 17; perl: 13; sh: 1
file content (331 lines) | stat: -rw-r--r-- 7,622 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#
# = bio/db/medline.rb - NCBI PubMed/MEDLINE database class
#
# Copyright::  Copyright (C) 2001, 2005
#              Toshiaki Katayama <k@bioruby.org>
# License::    The Ruby License
#
# $Id: medline.rb,v 1.17 2007/12/21 05:12:41 k Exp $
#

require 'bio/db'

module Bio

# == Description
#
# NCBI PubMed/MEDLINE database class.
#
# == Examples
#
#   medline = Bio::MEDLINE.new(txt)
#   medline.reference
#   medline.pmid == medline.entry_id
#   medline.mesh
#
class MEDLINE < NCBIDB

  # Parses the text of a MEDLINE record into the @pubmed hash.
  #
  # A line that starts with a word character opens a field: its first four
  # characters (stripped) are the tag and everything from column 6 onward is
  # the value.  Any other line continues the current field; the line break
  # ending the text collected so far is replaced by a single space before
  # the continuation text is appended.  Values of a repeated tag are
  # concatenated, each keeping its own trailing line break.
  #
  # entry:: object responding to each_line (presumably a String holding one
  #         record).
  def initialize(entry)
    @pubmed = Hash.new('')
    current_tag = ''

    entry.each_line do |row|
      if row =~ /^\w/
        current_tag = row[0, 4].strip
      else
        # continuation line: glue it to the preceding text with a space
        joined = @pubmed[current_tag].sub(/(?:\r|\r\n|\n)\z/, ' ')
        @pubmed[current_tag] = joined
      end

      payload = row[6..-1]
      next unless payload
      @pubmed[current_tag] += payload
    end
  end
  # Read access to the hash built by initialize: MEDLINE tag => accumulated
  # field text.
  attr_reader :pubmed


  # Builds a Reference object from this record.
  #
  # The bibliographic accessors of this class are collected into a hash;
  # entries whose value is nil or empty are removed before the hash is
  # handed to Reference.new.
  def reference
    fields = {
      'authors'      => authors,
      'title'        => title,
      'journal'      => journal,
      'volume'       => volume,
      'issue'        => issue,
      'pages'        => pages,
      'year'         => year,
      'pubmed'       => pmid,
      'medline'      => ui,
      'abstract'     => abstract,
      'mesh'         => mesh,
      'doi'          => doi,
      'affiliations' => affiliations
    }

    fields.delete_if { |_key, value| value.nil? || value.empty? }

    Reference.new(fields)
  end


  ### Common MEDLINE tags

  # PMID - PubMed Unique Identifier
  #   Unique number assigned to each PubMed citation.
  #   Returns the PMID field with surrounding whitespace removed (an empty
  #   string when the field is absent).  Also available as entry_id.
  def pmid
    raw = @pubmed['PMID']
    raw.strip
  end
  alias entry_id pmid

  # UI   - MEDLINE Unique Identifier
  #   Unique number assigned to each MEDLINE citation.
  #   Returns the UI field with surrounding whitespace removed.
  def ui
    raw = @pubmed['UI']
    raw.strip
  end

  # TA   - Journal Title Abbreviation
  #   Standard journal title abbreviation.
  #   Returns the TA field with every run of whitespace collapsed to one
  #   space and the ends trimmed.  Also available as journal.
  def ta
    collapsed = @pubmed['TA'].gsub(/\s+/, ' ')
    collapsed.strip
  end
  alias journal ta

  # VI   - Volume
  #   Journal volume.
  #   Returns the VI field with surrounding whitespace removed.  Also
  #   available as volume.
  def vi
    raw = @pubmed['VI']
    raw.strip
  end
  alias volume vi

  # IP   - Issue
  #   The number of the issue, part, or supplement of the journal in which
  #   the article was published.
  #   Returns the IP field with surrounding whitespace removed.  Also
  #   available as issue.
  def ip
    raw = @pubmed['IP']
    raw.strip
  end
  alias issue ip

  # PG   - Page Number
  #   The full pagination of the article.
  #   Returns the PG field exactly as recorded, with surrounding whitespace
  #   removed.
  def pg
    raw = @pubmed['PG']
    raw.strip
  end

  # Returns the pagination from pg with an abbreviated end page written out.
  #
  # When the field contains a hyphen, the text before the first hyphen is the
  # start page and the text after it the end page.  If the end page is
  # shorter than the start page, it is completed with the leading characters
  # of the start page, e.g. "1234-40" becomes "1234-1240".  A field without a
  # hyphen is returned as is.
  #
  # A field whose hyphen has nothing on one side ("123-", "-") is also
  # returned unchanged instead of raising NoMethodError on the missing part.
  def pages
    range = pg
    return range unless range.include?('-')

    first, last = range.split('-')
    # split drops trailing empty parts, so a dangling hyphen leaves one side
    # nil; there is nothing to expand in that case.
    return range if first.nil? || last.nil?

    missing = first.length - last.length
    last = first[0, missing] + last if missing > 0
    "#{first}-#{last}"
  end

  # DP   - Publication Date
  #   The date the article was published.
  #   Returns the DP field with surrounding whitespace removed.  Also
  #   available as date.
  def dp
    raw = @pubmed['DP']
    raw.strip
  end
  alias date dp

  # Returns the first four characters of the publication date (dp), or
  # fewer when the date is shorter than that.
  def year
    dp.slice(0, 4)
  end

  # TI   - Title Words
  #   The title of the article.
  #   Returns the TI field with every run of whitespace collapsed to one
  #   space and the ends trimmed.  Also available as title.
  def ti
    collapsed = @pubmed['TI'].gsub(/\s+/, ' ')
    collapsed.strip
  end
  alias title ti

  # AB   - Abstract
  #   Abstract.
  #   Returns the AB field with every run of whitespace collapsed to one
  #   space and the ends trimmed.  Also available as abstract.
  def ab
    collapsed = @pubmed['AB'].gsub(/\s+/, ' ')
    collapsed.strip
  end
  alias abstract ab

  # AU   - Author Name
  #   Authors' names.
  #   Returns the accumulated AU field as one string (one author per line),
  #   with surrounding whitespace removed.
  def au
    raw = @pubmed['AU']
    raw.strip
  end

  # Returns the authors from au as an Array of reformatted names.
  #
  # Each line of au that contains a space is split on whitespace.  The last
  # token is taken as the initials and rewritten with ". " between its
  # characters plus a final "."; the remaining tokens form the name, giving
  # "Name, I. N.".  When there are more than two tokens and the
  # second-to-last consists only of capital letters, the last token is
  # treated as a suffix (the initials being the token before it) and is
  # appended after the initials.  A line without a space is returned
  # unchanged.
  def authors
    au.split(/\n/).map do |entry|
      next entry unless entry =~ / /

      parts = entry.split(/\s+/)
      # second to last token being all capitals means the last is a suffix
      suffix = (parts.length > 2 && parts[-2] =~ /^[A-Z]+$/) ? parts.pop : nil
      initials = parts.pop.split(//).join('. ')

      formatted = "#{parts.join(' ')}, #{initials}."
      formatted << " " + suffix if suffix
      formatted
    end
  end

  # SO   - Source
  #   Composite field containing bibliographic information.
  #   Returns the SO field with surrounding whitespace removed.  Also
  #   available as source.
  def so
    raw = @pubmed['SO']
    raw.strip
  end
  alias source so

  # MH   - MeSH Terms
  #   NLM's controlled vocabulary.
  #   Returns an Array with one string per line of the accumulated MH field
  #   (empty when the field is absent).  Also available as mesh.
  def mh
    trimmed = @pubmed['MH'].strip
    trimmed.split(/\n/)
  end
  alias mesh mh

  # AD   - Affiliation
  #   Institutional affiliation and address of the first author, and grant
  #   numbers.
  #   Returns an Array with one string per line of the accumulated AD field
  #   (empty when the field is absent).  Also available as affiliations.
  def ad
    trimmed = @pubmed['AD'].strip
    trimmed.split(/\n/)
  end
  alias affiliations ad

  # AID  - Article Identifier
  #   Article ID values may include the pii (controlled publisher identifier)
  #   or doi (Digital Object Identifier).
  #   Returns the first run of non-space characters in the AID field that is
  #   followed by " [doi]", or nil when there is none.
  def doi
    found = /(\S+) \[doi\]/.match(@pubmed['AID'])
    found && found[1]
  end

  # Returns the first run of non-space characters in the AID field that is
  # followed by " [pii]", or nil when there is none.
  def pii
    found = /(\S+) \[pii\]/.match(@pubmed['AID'])
    found && found[1]
  end

  ### Other MEDLINE tags

  # CI   - Copyright Information
  #   Copyright statement.

  # CIN  - Comment In
  #   Reference containing a comment about the article.

  # CN   - Collective Name
  #   Corporate author or group names with authorship responsibility.

  # CON  - Comment On
  #   Reference upon which the article comments.

  # CY   - Country
  #   The place of publication of the journal.

  # DA   - Date Created
  #   Used for internal processing at NLM.

  # DCOM - Date Completed
  #   Used for internal processing at NLM.

  # DEP  - Date of Electronic Publication
  #   Electronic publication date.

  # EDAT - Entrez Date
  #   The date the citation was added to PubMed.

  # EIN  - Erratum In
  #   Reference containing a published erratum to the article.

  # GS   - Gene Symbol
  #   Abbreviated gene names (used 1991 through 1996).

  # ID   - Identification Number 
  #   Research grant numbers, contract numbers, or both that designate
  #   financial support by any agency of the US PHS (Public Health Service).

  # IS   - ISSN
  #   International Standard Serial Number of the journal.

  # JC   - Journal Title Code
  #   MEDLINE unique three-character code for the journal.

  # JID  - NLM Unique ID
  #   Unique journal ID in NLM's catalog of books, journals, and audiovisuals.

  # LA   - Language
  #   The language in which the article was published.

  # LR   - Last Revision Date
  #   The date a change was made to the record during a maintenance procedure.

  # MHDA - MeSH Date
  #   The date MeSH terms were added to the citation. The MeSH date is the
  #   same as the Entrez date until MeSH are added.

  # PHST - Publication History Status Date
  #   History status date.

  # PS   - Personal Name as Subject
  #   Individual is the subject of the article.

  # PST  - Publication Status
  #   Publication status.

  # PT   - Publication Type
  #   The type of material the article represents.
  #   Returns an Array with one string per line of the accumulated PT field
  #   (empty when the field is absent).  Also available as publication_type.
  def pt
    trimmed = @pubmed['PT'].strip
    trimmed.split(/\n/)
  end
  alias publication_type pt

  # RF   - Number of References
  #   Number of bibliographic references for Review articles.

  # RIN  - Retraction In
  #   Retraction of the article

  # RN   - EC/RN Number
  #   Number assigned by the Enzyme Commission to designate a particular
  #   enzyme or by the Chemical Abstracts Service for Registry Numbers.

  # ROF  - Retraction Of
  #   Article being retracted.

  # RPF  - Republished From
  #   Original article.

  # SB   - Journal Subset
  #   Code for a specific set of journals.

  # SI   - Secondary Source Identifier
  #   Identifies a secondary source that supplies information, e.g., other
  #   data sources, databanks and accession numbers of molecular sequences
  #   discussed in articles.

  # TT   - Transliterated / Vernacular Title 
  #   Non-Roman alphabet language titles are transliterated.

  # UIN  - Update In
  #   Update to the article.

  # UOF  - Update Of
  #   The article being updated.

  # URLF - URL Full-Text
  #   Link to the full-text of article at provider's website. Links are
  #   incomplete. Use PmLink for the complete set of available links.
  #   [PmLink] http://www.ncbi.nlm.nih.gov/entrez/utils/pmlink_help.html

  # URLS - URL Summary
  #   Link to the article summary at provider's website. Links are
  #   incomplete. Use PmLink for the complete set of available links.
  #   [PmLink] http://www.ncbi.nlm.nih.gov/entrez/utils/pmlink_help.html

end # MEDLINE

end # Bio