File: pinot-enum-index.sh

package info (click to toggle)
pinot 1.23-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,608 kB
  • sloc: cpp: 41,870; makefile: 611; xml: 416; sh: 336
file content (84 lines) | stat: -rwxr-xr-x 2,270 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
# A script that enumerates files in an index created by Pinot
# and gives an estimate of how much disk space those take.

# The index directory is a mandatory argument; print a usage hint and
# stop when it was not given.
(( $# > 0 )) || {
  echo "Usage: $0 INDEX"
  exit 1
}

# Check programs we need are available.
# The probe uses the `command -v` builtin rather than the external `which`,
# so a system without `which` is not misreported as lacking delve.
# Diagnostics go to stderr.

# delve might be called something else: try "delve" first and fall back to
# "xapian-delve". DELVE holds the name the rest of the script invokes.
DELVE="delve"
if ! command -v delve >/dev/null 2>&1; then
  if ! command -v xapian-delve >/dev/null 2>&1; then
    echo "Couldn't find delve. Is the xapian-core package installed ?" >&2
    exit 1
  fi
  DELVE="xapian-delve"
fi

# du measures on-disk files, dc adds up the collected sizes.
if ! command -v du >/dev/null 2>&1; then
  echo "Couldn't find du. Is the coreutils package installed ?" >&2
  exit 1
fi
if ! command -v dc >/dev/null 2>&1; then
  echo "Couldn't find dc. Is the bc package installed ?" >&2
  exit 1
fi

# The argument has to name an existing directory.
[[ -d "$1" ]] || {
  echo "$1 is not a directory"
  exit 1
}

# Drop the output files a previous run may have left in the index directory.
rm -f "$1"/{urls,filesizes}.txt

# Get a list of documents.
# delve is run on its own, not as the head of a pipeline, so that the status
# tested here is delve's; after "delve | sed", $? only reports on sed and a
# failed query would go unnoticed.
if ! POSTINGS=$($DELVE -t X-MetaSE-Doc "$1"); then
  echo "Couldn't query database at $1" >&2
  exit 1
fi
# Keep only what follows the last ": " on each line of delve's output; the
# result is later word-split into individual document IDs.
DOCIDS=$(printf '%s\n' "$POSTINGS" | sed -e "s/\(.*\): \(.*\)/\2/g")

# Append a size to the dc program that computes the total.
# dc executes its input as commands, so a value that is not a plain run of
# decimal digits (empty, several values, stray text) is dropped instead of
# being written.
#   $1 - path of the dc program file
#   $2 - candidate size in bytes
append_size() {
  case "$2" in
    '' | *[!0-9]*) return 0 ;;
  esac
  echo "$2" >> "$1"
  echo "+" >> "$1"
}

echo "Listing documents in index"
# filesizes.txt is built up as a dc program: a leading 0, then "<size>" and
# "+" for every document with a usable size, and a closing "n".
echo "0" >> "$1/filesizes.txt"
for DOCID in $DOCIDS; do
  # Dump the document record once and pick every field out of the saved copy.
  RECORD=$($DELVE -d -r "$DOCID" "$1")

  # Documents with a file scheme: strip everything up to "://" and measure
  # the file on disk.
  FILENAME=$(printf '%s\n' "$RECORD" | grep "url=file" | sed -e "s/url=\(.*\):\/\///g")
  if [ -n "$FILENAME" ]; then
    # -s keeps du to a single total line even when the path is a directory.
    FILESIZE=$(du -sb -- "$FILENAME" | sed -e "s/\([0-9]*\)\(.*\)/\1/g")
    append_size "$1/filesizes.txt" "$FILESIZE"
    printf '%s\n' "$FILENAME" >> "$1/urls.txt"
    continue
  fi

  # Documents with a scheme other than file: list the URL as is and take
  # the size stored in the record.
  # NOTE(review): the grep patterns are unanchored, so any record line that
  # merely contains "size=" also matches; append_size rejects the result
  # when that leaves more than one value.
  URL=$(printf '%s\n' "$RECORD" | grep "url=" | sed -e "s/url=//g")
  if [ -n "$URL" ]; then
    FILESIZE=$(printf '%s\n' "$RECORD" | grep "size=" | sed -e "s/size=//g")
    append_size "$1/filesizes.txt" "$FILESIZE"
    printf '%s\n' "$URL" >> "$1/urls.txt"
  fi
done
echo "n" >> "$1/filesizes.txt"
echo "List is in $1/urls.txt"

echo "Summarizing disk usage for indexed documents"
# Run the accumulated dc program and report what it prints as the byte total.
printf '%s bytes\n' "$(dc --file="$1/filesizes.txt")"

exit 0