1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
|
Description: Try to reduce confusion around docx files
 Now also checks for XML files and HTML files
Author: Olly Betts <olly@survex.com>
Bug-Debian: https://bugs.debian.org/758959
Bug-Debian: https://bugs.debian.org/791532
Forwarded: no
Last-Update: 2015-01-11
--- a/Docs/antiword.1
+++ b/Docs/antiword.1
@@ -14,7 +14,11 @@
.br
A wordfile named - stands for a Word document read from the standard input.
.br
-Only documents made by MS Word version 2 and version 6 or later are supported.
+Only the binary format documents made by MS Word version 2, 6, 7, 97, 2000 and
+2003 are supported. Newer Word versions default to using a completely
+different format consisting of XML files in a ZIP container (usually with a
+".docx" file extension) which antiword doesn't support. It also doesn't
+support the "flat" XML format which MS Word 2003 supported.
.SH OPTIONS
.TP
.BI "\-a " papersize
--- a/antiword.h
+++ b/antiword.h
@@ -695,6 +695,9 @@
extern BOOL bIsWordForDosFile(FILE *, long);
extern BOOL bIsRtfFile(FILE *);
extern BOOL bIsWordPerfectFile(FILE *);
+extern BOOL bIsZipFile(FILE *);
+extern BOOL bIsXMLFile(FILE *);
+extern BOOL bIsHTMLFile(FILE *);
extern BOOL bIsWinWord12File(FILE *, long);
extern BOOL bIsMacWord45File(FILE *);
extern int iGuessVersionNumber(FILE *, long);
--- a/main_u.c
+++ b/main_u.c
@@ -187,10 +187,29 @@
werr(0, "%s is not a Word Document."
" It is probably a Rich Text Format file",
szFilename);
- } if (bIsWordPerfectFile(pFile)) {
+ } else if (bIsWordPerfectFile(pFile)) {
werr(0, "%s is not a Word Document."
" It is probably a Word Perfect file",
szFilename);
+ } else if (bIsZipFile(pFile)) {
+ werr(0, "%s is not a Word Document."
+ " It seems to be a ZIP file, so is probably"
+ " an OpenDocument file, or a \"docx\" file"
+ " from MS Word 2007 or newer"
+ " (antiword only handles binary format"
+ " documents from MS Word 2003 and earlier)",
+ szFilename);
+ } else if (bIsXMLFile(pFile)) {
+ werr(0, "%s is not a Word Document."
+ " It seems to be an XML file, perhaps"
+ " the XML format from MS Word 2003"
+ " (antiword only handles binary format"
+ " documents from MS Word 2003 and earlier)",
+ szFilename);
+ } else if (bIsHTMLFile(pFile)) {
+ werr(0, "%s is not a Word Document."
+ " It is probably an HTML file",
+ szFilename);
} else {
#if defined(__dos)
werr(0, "%s is not a Word Document or the filename"
--- a/wordlib.c
+++ b/wordlib.c
@@ -41,7 +41,7 @@
BOOL
bIsWordForDosFile(FILE *pFile, long lFilesize)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab }; /* Word for DOS */
DBG_MSG("bIsWordForDosFile");
@@ -64,7 +64,7 @@
static BOOL
bIsWordFileWithOLE(FILE *pFile, long lFilesize)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
int iTailLen;
@@ -108,7 +108,7 @@
BOOL
bIsRtfFile(FILE *pFile)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ '{', '\\', 'r', 't', 'f', '1' };
DBG_MSG("bIsRtfFile");
@@ -122,7 +122,7 @@
BOOL
bIsWordPerfectFile(FILE *pFile)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ 0xff, 'W', 'P', 'C' };
DBG_MSG("bIsWordPerfectFile");
@@ -131,13 +131,65 @@
} /* end of bIsWordPerfectFile */
/*
+ * This function checks whether the given file is or is not a ZIP file
+ */
+BOOL
+bIsZipFile(FILE *pFile)
+{
+ static const UCHAR aucBytes[] =
+ { 'P', 'K', 0x03, 0x04 };
+
+ DBG_MSG("bIsZipFile");
+
+ return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsZipFile */
+
+/*
+ * This function checks whether the given file is or is not an XML file
+ */
+BOOL
+bIsXMLFile(FILE *pFile)
+{
+ static const UCHAR aucBytes[] =
+ { '<', '?', 'x', 'm', 'l' };
+
+ DBG_MSG("bIsXMLFile");
+
+ return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsXMLFile */
+
+/*
+ * This function checks whether the given file is or is not an HTML file
+ */
+BOOL
+bIsHTMLFile(FILE *pFile)
+{
+ static const UCHAR aucBytes[2][5] = {
+ { '<', 'h', 't', 'm', 'l' },
+ { '<', 'H', 'T', 'M', 'L' },
+ };
+ int iIndex;
+
+ DBG_MSG("bIsHTMLFile");
+
+ for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
+ if (bCheckBytes(pFile,
+ aucBytes[iIndex],
+ elementsof(aucBytes[iIndex]))) {
+ return TRUE;
+ }
+ }
+ return FALSE;
+} /* end of bIsHTMLFile */
+
+/*
* This function checks whether the given file is or is not a "Win Word 1 or 2"
* document
*/
BOOL
bIsWinWord12File(FILE *pFile, long lFilesize)
{
- static UCHAR aucBytes[2][4] = {
+ static const UCHAR aucBytes[2][4] = {
{ 0x9b, 0xa5, 0x21, 0x00 }, /* Win Word 1.x */
{ 0xdb, 0xa5, 0x2d, 0x00 }, /* Win Word 2.0 */
};
@@ -171,7 +223,7 @@
BOOL
bIsMacWord45File(FILE *pFile)
{
- static UCHAR aucBytes[2][6] = {
+ static const UCHAR aucBytes[2][6] = {
{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 }, /* Mac Word 4 */
{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 }, /* Mac Word 5 */
};
|