File: docx2txt.sh

package info (click to toggle)
docx2txt 1.4-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 188 kB
  • sloc: perl: 391; sh: 49; makefile: 35
file content (120 lines) | stat: -rwxr-xr-x 3,363 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env bash

# docx2txt, a command-line utility to convert Docx documents to text format.
# Copyright (C) 2008 Sandeep Kumar
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#
# A simple .docx to .txt converter
#
# This script is a wrapper around core docx2txt.pl and saves text output for
# (filename or) filename.docx in filename.txt .
#
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
#
# ChangeLog :
#
#    10/08/2008 - Initial version (v0.1)
#    15/08/2008 - Invoking docx2txt.pl with docx document instead of xml file,
#                 so don't need unzip and rm actions now.
#                 Removed dependency on sed for generating output filename.
#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
#                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
#                 during installation.
#    15/09/2009 - Added support for directory (holding unzipped content of
#                 .docx file) argument to keep it's usage in sync with main
#                 docx2txt.pl script.
#                 Fixed bug in condition check for input file accessibility.
#    25/11/2009 - Fixed bug in set expression that was resulting in incorrect
#                 handling of file/directory names containing spaces.
#


MYLOC=`dirname "$0"`	# invoked perl script docx2txt.pl is expected here.

function usage ()
{
    cat << _USAGE_

Usage : $0 <file.docx>

	<file.docx> can also specify a directory holding the unzipped
	content of a .docx file.

_USAGE_

    exit 1
}

[ $# != 1 ] && usage

#
# Remove trailing '/'s if any, when input specifies a directory.
#
shopt -s extglob
set "${1%%+(/)}"

if [ -d "$1" ]
then
    if ! [ -r "$1" -a -x "$1" ]
    then
        echo -e "\nCan't access/read input directory <$1>!\n"
        exit 1
    fi
elif ! [ -f "$1" -a -r "$1" -a -s "$1" ]
then
    echo -e "\nCheck if <$1> exists, is readable and has non-zero size!\n"
    exit 1
fi


TEXTFILE=${1/%.docx/.txt}
[ "$1" == "$TEXTFILE" ] && TEXTFILE="$1.txt" 


#
# $1 : filename to check for existence
# $2 : message regarding file
#
function check_for_existence ()
{
    if [ -f "$1" ]
    then
        read -p "overwrite $2 <$1> [y/n] ? " yn
        if [ "$yn" != "y" ]
        then
            echo -e "\nPlease copy <$1> somewhere before running the script.\n"
            echeck=1
        fi
    fi
}

echeck=0
check_for_existence "$TEXTFILE" "Output text file"
[ $echeck -ne 0 ] && exit 1

#
# Invoke perl script to do the actual text extraction
#

"$MYLOC/docx2txt.pl" "$1" "$TEXTFILE"
if [ $? == 0 ]
then
    echo -e "\nText extracted from <$1> is available in <$TEXTFILE>.\n"
else
    echo -e "\nFailed to extract text from <$1>!\n"
fi