File: docx.patch

package info (click to toggle)
antiword 0.37-16
links: PTS
area: main
in suites: bookworm, bullseye, sid, trixie
size: 2,236 kB
sloc: ansic: 27,835; perl: 174; sh: 129; php: 83; makefile: 24
file content (182 lines) | stat: -rw-r--r-- 5,062 bytes
parent folder | download | duplicates (3)
Description: Try to reduce confusion around docx files
 Now also checks for XML files and HTML files
Author: Olly Betts <olly@survex.com>
Bug-Debian: https://bugs.debian.org/758959
Bug-Debian: https://bugs.debian.org/791532
Forwarded: no
Last-Update: 2015-01-11

--- a/Docs/antiword.1
+++ b/Docs/antiword.1
@@ -14,7 +14,11 @@
 .br
 A wordfile named - stands for a Word document read from the standard input.
 .br
-Only documents made by MS Word version 2 and version 6 or later are supported.
+Only the binary format documents made by MS Word version 2, 6, 7, 97, 2000 and
+2003 are supported.  Newer Word versions default to using a completely
+different format consisting of XML files in a ZIP container (usually with a
+".docx" file extension) which antiword doesn't support.  It also doesn't
+support the "flat" XML format which MS Word 2003 supported.
 .SH OPTIONS
 .TP
 .BI "\-a " papersize
--- a/antiword.h
+++ b/antiword.h
@@ -695,6 +695,9 @@
 extern BOOL	bIsWordForDosFile(FILE *, long);
 extern BOOL	bIsRtfFile(FILE *);
 extern BOOL	bIsWordPerfectFile(FILE *);
+extern BOOL	bIsZipFile(FILE *);
+extern BOOL	bIsXMLFile(FILE *);
+extern BOOL	bIsHTMLFile(FILE *);
 extern BOOL	bIsWinWord12File(FILE *, long);
 extern BOOL	bIsMacWord45File(FILE *);
 extern int	iGuessVersionNumber(FILE *, long);
--- a/main_u.c
+++ b/main_u.c
@@ -187,10 +187,29 @@
 			werr(0, "%s is not a Word Document."
 				" It is probably a Rich Text Format file",
 				szFilename);
-		} if (bIsWordPerfectFile(pFile)) {
+		} else if (bIsWordPerfectFile(pFile)) {
 			werr(0, "%s is not a Word Document."
 				" It is probably a Word Perfect file",
 				szFilename);
+		} else if (bIsZipFile(pFile)) {
+			werr(0, "%s is not a Word Document."
+				" It seems to be a ZIP file, so is probably"
+				" an OpenDocument file, or a \"docx\" file"
+				" from MS Word 2007 or newer"
+				" (antiword only handles binary format"
+				" documents from MS Word 2003 and earlier)",
+				szFilename);
+		} else if (bIsXMLFile(pFile)) {
+			werr(0, "%s is not a Word Document."
+				" It seems to be an XML file, perhaps"
+				" the XML format from MS Word 2003"
+				" (antiword only handles binary format"
+				" documents from MS Word 2003 and earlier)",
+				szFilename);
+		} else if (bIsHTMLFile(pFile)) {
+			werr(0, "%s is not a Word Document."
+				" It is probably an HTML file",
+				szFilename);
 		} else {
 #if defined(__dos)
 			werr(0, "%s is not a Word Document or the filename"
--- a/wordlib.c
+++ b/wordlib.c
@@ -41,7 +41,7 @@
 BOOL
 bIsWordForDosFile(FILE *pFile, long lFilesize)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab };	/* Word for DOS */
 
 	DBG_MSG("bIsWordForDosFile");
@@ -64,7 +64,7 @@
 static BOOL
 bIsWordFileWithOLE(FILE *pFile, long lFilesize)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
 	int	iTailLen;
 
@@ -108,7 +108,7 @@
 BOOL
 bIsRtfFile(FILE *pFile)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ '{', '\\', 'r', 't', 'f', '1' };
 
 	DBG_MSG("bIsRtfFile");
@@ -122,7 +122,7 @@
 BOOL
 bIsWordPerfectFile(FILE *pFile)
 {
-	static UCHAR	aucBytes[] =
+	static const UCHAR	aucBytes[] =
 		{ 0xff, 'W', 'P', 'C' };
 
 	DBG_MSG("bIsWordPerfectFile");
@@ -131,13 +131,65 @@
 } /* end of bIsWordPerfectFile */
 
 /*
+ * This function checks whether the given file is or is not a ZIP file
+ */
+BOOL
+bIsZipFile(FILE *pFile)
+{
+	static const UCHAR	aucBytes[] =
+		{ 'P', 'K', 0x03, 0x04 };
+
+	DBG_MSG("bIsZipFile");
+
+	return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsZipFile */
+
+/*
+ * This function checks whether the given file is or is not a XML file
+ */
+BOOL
+bIsXMLFile(FILE *pFile)
+{
+	static const UCHAR	aucBytes[] =
+		{ '<', '?', 'x', 'm', 'l' };
+
+	DBG_MSG("bIsXMLFile");
+
+	return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsXMLFile */
+
+/*
+ * This function checks whether the given file is or is not a HTML file
+ */
+BOOL
+bIsHTMLFile(FILE *pFile)
+{
+	static const UCHAR	aucBytes[2][5] = {
+		{ '<', 'h', 't', 'm', 'l' },
+		{ '<', 'H', 'T', 'M', 'L' },
+	};
+	int	iIndex;
+
+	DBG_MSG("bIsHTMLFile");
+
+	for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
+		if (bCheckBytes(pFile,
+				aucBytes[iIndex],
+				elementsof(aucBytes[iIndex]))) {
+			return TRUE;
+		}
+	}
+	return FALSE;
+} /* end of bIsHTMLFile */
+
+/*
  * This function checks whether the given file is or is not a "Win Word 1 or 2"
  * document
  */
 BOOL
 bIsWinWord12File(FILE *pFile, long lFilesize)
 {
-	static UCHAR	aucBytes[2][4] = {
+	static const UCHAR	aucBytes[2][4] = {
 		{ 0x9b, 0xa5, 0x21, 0x00 },	/* Win Word 1.x */
 		{ 0xdb, 0xa5, 0x2d, 0x00 },	/* Win Word 2.0 */
 	};
@@ -171,7 +223,7 @@
 BOOL
 bIsMacWord45File(FILE *pFile)
 {
-	static UCHAR	aucBytes[2][6] = {
+	static const UCHAR	aucBytes[2][6] = {
 		{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 },	/* Mac Word 4 */
 		{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 },	/* Mac Word 5 */
 	};