Package: antiword / 0.37-14

docx.patch Patch series | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
Description: Try to reduce confusion around docx files
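 Detect ZIP (docx/OpenDocument), XML and HTML input and report a specific
 error for each.  Also add a missing "else" in main_u.c so that a file already
 reported as Rich Text Format does not get the generic error as well.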
Author: Olly Betts <olly@survex.com>
Bug-Debian: https://bugs.debian.org/758959
Bug-Debian: https://bugs.debian.org/791532
Forwarded: no
Last-Update: 2015-01-11

--- a/Docs/antiword.1
+++ b/Docs/antiword.1
@@ -14,7 +14,11 @@
 .br
 A wordfile named - stands for a Word document read from the standard input.
 .br
-Only documents made by MS Word version 2 and version 6 or later are supported.
+Only the binary format documents made by MS Word version 2, 6, 7, 97, 2000 and
+2003 are supported.  Newer Word versions default to using a completely
+different format consisting of XML files in a ZIP container (usually with a
+".docx" file extension) which antiword doesn't support.  It also doesn't
+support the "flat" XML format which MS Word 2003 supported.
 .SH OPTIONS
 .TP
 .BI "\-a " papersize
--- a/antiword.h
+++ b/antiword.h
@@ -695,6 +695,9 @@
 extern BOOL	bIsWordForDosFile(FILE *, long);
 extern BOOL	bIsRtfFile(FILE *);
 extern BOOL	bIsWordPerfectFile(FILE *);
+extern BOOL	bIsZipFile(FILE *);
+extern BOOL	bIsXMLFile(FILE *);
+extern BOOL	bIsHTMLFile(FILE *);
 extern BOOL	bIsWinWord12File(FILE *, long);
 extern BOOL	bIsMacWord45File(FILE *);
 extern int	iGuessVersionNumber(FILE *, long);
--- a/main_u.c
+++ b/main_u.c
@@ -187,10 +187,29 @@
 			werr(0, "%s is not a Word Document."
 				" It is probably a Rich Text Format file",
 				szFilename);
-		} if (bIsWordPerfectFile(pFile)) {
+		} else if (bIsWordPerfectFile(pFile)) {
 			werr(0, "%s is not a Word Document."
 				" It is probably a Word Perfect file",
 				szFilename);
+		} else if (bIsZipFile(pFile)) {
+			werr(0, "%s is not a Word Document."
+				" It seems to be a ZIP file, so is probably"
+				" an OpenDocument file, or a \"docx\" file"
+				" from MS Word 2007 or newer"
+				" (antiword only handles binary format"
+				" documents from MS Word 2003 and earlier)",
+				szFilename);
+		} else if (bIsXMLFile(pFile)) {
+			werr(0, "%s is not a Word Document."
+				" It seems to be an XML file, perhaps"
+				" the XML format from MS Word 2003"
+				" (antiword only handles binary format"
+				" documents from MS Word 2003 and earlier)",
+				szFilename);
+		} else if (bIsHTMLFile(pFile)) {
+			werr(0, "%s is not a Word Document."
+				" It is probably an HTML file",
+				szFilename);
 		} else {
 #if defined(__dos)
 			werr(0, "%s is not a Word Document or the filename"
--- a/wordlib.c
+++ b/wordlib.c
@@ -41,7 +41,7 @@
 BOOL
 bIsWordForDosFile(FILE *pFile, long lFilesize)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab };	/* Word for DOS */
 
 	DBG_MSG("bIsWordForDosFile");
@@ -64,7 +64,7 @@
 static BOOL
 bIsWordFileWithOLE(FILE *pFile, long lFilesize)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
 	int	iTailLen;
 
@@ -108,7 +108,7 @@
 BOOL
 bIsRtfFile(FILE *pFile)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ '{', '\\', 'r', 't', 'f', '1' };
 
 	DBG_MSG("bIsRtfFile");
@@ -122,7 +122,7 @@
 BOOL
 bIsWordPerfectFile(FILE *pFile)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ 0xff, 'W', 'P', 'C' };
 
 	DBG_MSG("bIsWordPerfectFile");
@@ -131,13 +131,65 @@
 } /* end of bIsWordPerfectFile */
 
 /*
+ * This function checks whether the given file is or is not a ZIP file
+ */
+BOOL
+bIsZipFile(FILE *pFile)
+{
+	static const UCHAR	aucBytes[] =
+		{ 'P', 'K', 0x03, 0x04 };
+
+	DBG_MSG("bIsZipFile");
+
+	return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsZipFile */
+
+/*
+ * This function checks whether the given file is or is not an XML file
+ */
+BOOL
+bIsXMLFile(FILE *pFile)
+{
+	static const UCHAR	aucBytes[] =
+		{ '<', '?', 'x', 'm', 'l' };
+
+	DBG_MSG("bIsXMLFile");
+
+	return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsXMLFile */
+
+/*
+ * This function checks whether the given file is or is not an HTML file
+ */
+BOOL
+bIsHTMLFile(FILE *pFile)
+{
+	static const UCHAR	aucBytes[2][5] = {
+		{ '<', 'h', 't', 'm', 'l' },
+		{ '<', 'H', 'T', 'M', 'L' },
+	};
+	int	iIndex;
+
+	DBG_MSG("bIsHTMLFile");
+
+	for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
+		if (bCheckBytes(pFile,
+				aucBytes[iIndex],
+				elementsof(aucBytes[iIndex]))) {
+			return TRUE;
+		}
+	}
+	return FALSE;
+} /* end of bIsHTMLFile */
+
+/*
  * This function checks whether the given file is or is not a "Win Word 1 or 2"
  * document
  */
 BOOL
 bIsWinWord12File(FILE *pFile, long lFilesize)
 {
-	static UCHAR	aucBytes[2][4] = {
+	static const UCHAR	aucBytes[2][4] = {
 		{ 0x9b, 0xa5, 0x21, 0x00 },	/* Win Word 1.x */
 		{ 0xdb, 0xa5, 0x2d, 0x00 },	/* Win Word 2.0 */
 	};
@@ -171,7 +223,7 @@
 BOOL
 bIsMacWord45File(FILE *pFile)
 {
-	static UCHAR	aucBytes[2][6] = {
+	static const UCHAR	aucBytes[2][6] = {
 		{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 },	/* Mac Word 4 */
 		{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 },	/* Mac Word 5 */
 	};