1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
{-# LANGUAGE OverloadedStrings #-}
import Test.HUnit hiding (Test)
import Test.Hspec
import Test.Hspec.QuickCheck
import Data.ByteString.Lazy.Char8 ()
import qualified Text.HTML.DOM as H
import qualified Text.XML as X
import qualified Data.Map as Map
import qualified Data.Text as T
import Control.Exception (evaluate)
import Control.DeepSeq (($!!))
import Control.Monad (void)
main :: IO ()
main = hspec $ do
describe "parses" $ do
it "well-formed document" $
X.parseLBS_ X.def "<foo><bar>baz</bar></foo>" @=?
H.parseLBS "<foo><bar>baz</bar></foo>"
it "adds missing close tags" $
X.parseLBS_ X.def "<foo><bar>baz</bar></foo>" @=?
H.parseLBS "<foo><bar>baz</foo>"
it "void tags" $
X.parseLBS_ X.def "<foo><bar><img/>foo</bar></foo>" @=?
H.parseLBS "<foo><bar><img>foo</foo>"
it "xml entities" $
X.parseLBS_ X.def "<foo><bar>baz></bar></foo>" @=?
H.parseLBS "<foo><bar>baz></foo>"
it "html entities" $
X.parseLBS_ X.def "<foo><bar>baz </bar></foo>" @=?
H.parseLBS "<foo><bar>baz </foo>"
it "decimal entities" $
X.parseLBS_ X.def "<foo><bar>baz </bar></foo>" @=?
H.parseLBS "<foo><bar>baz </foo>"
it "hex entities" $
X.parseLBS_ X.def "<foo><bar>bazŠ</bar></foo>" @=?
H.parseLBS "<foo><bar>bazŠ</foo>"
it "invalid entities" $
X.parseLBS_ X.def "<foo><bar>baz&foobar;</bar></foo>" @=?
H.parseLBS "<foo><bar>baz&foobar;</foo>"
it "multiple root elements" $
X.parseLBS_ X.def "<html><foo><bar>baz&foobar;</bar></foo><foo/></html>" @=?
H.parseLBS "<foo><bar>baz&foobar;</foo><foo>"
it "doesn't strip whitespace" $
X.parseLBS_ X.def "<foo> hello</foo>" @=?
H.parseLBS "<foo> hello</foo>"
it "split code-points" $
X.parseLBS_ X.def "<foo> </foo>" @=?
H.parseBSChunks ["<foo>\xc2", "\xa0</foo>"]
it "latin1 codes" $
X.parseText_ X.def "<foo>\232</foo>" @=?
H.parseSTChunks ["<foo>\232</foo>"]
it "latin1 codes strict vs lazy" $
H.parseLT "<foo>\232</foo>" @=?
H.parseSTChunks ["<foo>\232</foo>"]
describe "HTML parsing" $ do
it "XHTML" $
let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "XHTML with doctype and <?xml #30" $ do
let html = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "HTML" $
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "br" Map.empty []
, X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "Mixed case br #167" $
let html = "<html><head><title>foo</title></head><body><bR><p>Hello World</p><BR>done</body></html>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "bR" Map.empty []
, X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
, X.NodeElement $ X.Element "BR" Map.empty []
, X.NodeContent "done"
]
]
in H.parseLBS html @?= doc
it "doesn't double unescape" $
let html = "<p>Hello &gt; World</p>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "p" Map.empty
[ X.NodeContent "Hello > World"
]
in H.parseLBS html @?= doc
it "handles entities in attributes" $
let html = "<br title=\"Mac & Cheese\">"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "br" (Map.singleton "title" "Mac & Cheese") []
in H.parseLBS html @?= doc
it "doesn't double escape entities in attributes" $
let html = "<br title=\"Mac &amp; Cheese\">"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "br" (Map.singleton "title" "Mac & Cheese") []
in H.parseLBS html @?= doc
describe "script tags" $ do
it "ignores funny characters" $
let html = "<script>hello <> world</script>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "script" Map.empty [X.NodeContent "hello <> world"]
in H.parseLBS html @?= doc
{-
Would be nice... doesn't work with tagstream-conduit original
code. Not even sure if the HTML5 parser spec discusses this
case.
it "ignores </script> inside string" $
let html = "<script>hello \"</script>\" world</script>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "script" Map.empty [X.NodeContent "hello \"</script>\" world"]
in H.parseLBS html @?= doc
-}
it "unterminated" $
let html = "<script>hello > world"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "script" Map.empty [X.NodeContent "hello > world"]
in H.parseLBS html @?= doc
it "entities" $
let html = "<script>hello & world"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "script" Map.empty [X.NodeContent "hello & world"]
in H.parseLBS html @?= doc
prop "parses all random input" $ \strs -> void $ evaluate $!! H.parseSTChunks $ map T.pack strs
describe "#128 entities cut off" $ do
it "reported issue" $ do
let html = "<a href=\"https://example.com?a=b&c=d\">link</a>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element
"a"
(Map.singleton "href" "https://example.com?a=b&c=d")
[X.NodeContent "link"]
in H.parseLBS html @?= doc
it "from test suite" $ do
let html = "<a class=\"u-url\" href=\"https://secure.gravatar.com/avatar/947b5f3f323da0ef785b6f02d9c265d6?s=96&d=blank&r=g\">link</a>"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element
"a"
(Map.fromList
[ ("href", "https://secure.gravatar.com/avatar/947b5f3f323da0ef785b6f02d9c265d6?s=96&d=blank&r=g")
, ("class", "u-url")
])
[X.NodeContent "link"]
in H.parseLBS html @?= doc
|