File: simstrings

package info (click to toggle)
r-cran-qlcmatrix 0.9.8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,368 kB
  • sloc: makefile: 2
file content (94 lines) | stat: -rwxr-xr-x 2,542 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env Rscript

# =================
# Copyright 2015 Michael Cysouw <cysouw@mac.com>
#
# This file is free software: you may copy, redistribute and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This file is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# =================

# =====
# usage
# =====

# Help text for the command line interface. This string is handed to
# docopt::docopt() further down, which derives the accepted options and their
# defaults from the USAGE and OPTIONS sections, so those sections are
# functional and must stay in docopt syntax.
DOC <- "
USAGE: 
  simstrings [-h -d -s SEPARATOR -r DECIMALS <STRINGS>...]

DESCRIPTION:
  Using the function sim.strings() from the R-package qlcMatrix:
  Efficient computation of pairwise string similarities using a cosine 
  similarity on bigram vectors. Note that the algorithm is efficient, but there
  is an overhead because of the startup of R and loading packages.
  
  The STRINGS argument should be space-separated. Piping through is supported, see examples.
  Details at http://www.rdocumentation.org/packages/qlcMatrix/functions/sim.strings.html

OPTIONS:
  -h, --help      Showing this help text
  -d, --distance  Return distances instead of similarities
  -s SEPARATOR    Separator, defaults to nothing (i.e. separation at Unicode codepoints);
                  use 'S' to get space [default: ]
  -r DECIMALS     Round result to decimals [default: 3]

EXAMPLES

  simstrings abcd abdd abbb
  ls | simstrings
"

# ==============
# docopt parsing
# ==============

# Parse the command line against DOC. The parsed options are kept in a local
# list rather than attach()ed, so option names neither mask nor are masked by
# other objects on the search path.
opts <- docopt::docopt(DOC)

STRINGS <- opts[["STRINGS"]]
separator <- opts[["s"]]
distance <- isTRUE(opts[["distance"]])

# for piping data: with no strings on the command line, read one string per
# line from standard input
if (length(STRINGS) == 0) {
  stdin_con <- file("stdin")
  # Release only the connection opened here (also when scan() fails), instead
  # of closing every connection of the session.
  STRINGS <- tryCatch(
    scan(stdin_con, sep = "\n", quiet = TRUE, what = "character"),
    finally = close(stdin_con)
  )
}

# default values are strings by default, not numbers; reject anything that is
# not a single number instead of silently rounding with NA
decimals <- suppressWarnings(as.numeric(opts[["r"]]))
if (length(decimals) != 1L || is.na(decimals)) {
  stop("DECIMALS must be a number, got: ", opts[["r"]], call. = FALSE)
}

# space cannot be passed as argument in bash, so 'S' stands in for it
if (identical(separator, "S")) {
  separator <- " "
}

# ======
# R code
# ======

# Original author's note: this declaration is a bug; should not be necessary.
# NOTE(review): presumably needed because older Rscript versions did not
# attach 'methods' by default -- confirm before removing.
library(methods)

# Pairwise similarities between all strings; with --distance the values are
# turned into distances as 1 - similarity. The result is converted to a plain
# matrix and rounded to the requested number of decimals.
similarity <- qlcMatrix::sim.strings(STRINGS, sep = separator)
result <- as.matrix(if (distance) 1 - similarity else similarity)
result <- round(result, digits = decimals)


# =======================
# Return output to stdout
# =======================

# Print the result matrix as bare tab-separated values: an empty file name
# sends the table to the console, and both the row/column labels and the
# quoting are switched off.
write.table(
  result,
  file = "",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)