1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
|
module TagSoup.Sample where
import Text.HTML.TagSoup
import Network.HTTP
import Data.Char
import Data.List
openItem :: String -> IO String
openItem x | "http://" `isPrefixOf` x = getResponseBody =<< simpleHTTP (getRequest x)
| otherwise = readFile x
grab :: String -> IO ()
grab x = openItem x >>= putStr
parse :: String -> IO ()
parse x = openItem x >>= putStr . show2 . parseTags
where
show2 [] = "[]"
show2 xs = "[" ++ concat (intersperseNotBroken "\n," $ map show xs) ++ "\n]\n"
-- the standard intersperse has a strictness bug which sucks!
intersperseNotBroken :: a -> [a] -> [a]
intersperseNotBroken _ [] = []
intersperseNotBroken sep (x:xs) = x : is xs
where
is [] = []
is (y:ys) = sep : y : is ys
{-
<div class="printfooter">
<p>Retrieved from "<a href="http://haskell.org/haskellwiki/Haskell">http://haskell.org/haskellwiki/Haskell</a>"</p>
<p>This page has been accessed 507,753 times. This page was last modified 08:05, 24 January 2007. Recent content is available under <a href="/haskellwiki/HaskellWiki:Copyrights" title="HaskellWiki:Copyrights">a simple permissive license</a>.</p>
</div>
-}
haskellHitCount :: IO ()
haskellHitCount = do
tags <- fmap parseTags $ openItem "http://haskell.org/haskellwiki/Haskell"
let count = fromFooter $ head $ sections (~== "<div class=printfooter>") tags
putStrLn $ "haskell.org has been hit " ++ show count ++ " times"
where
fromFooter x = read (filter isDigit num) :: Int
where
num = ss !! (i - 1)
Just i = findIndex (== "times.") ss
ss = words s
TagText s = sections (~== "<p>") x !! 1 !! 1
googleTechNews :: IO ()
googleTechNews = do
tags <- fmap parseTags $ openItem "http://news.google.com/?ned=us&topic=t"
let links = [ ascii name ++ " <" ++ maybe "unknown" shortUrl (lookup "href" atts) ++ ">"
| TagOpen "h2" [("class","title")]:TagText spaces:TagOpen "a" atts:TagText name:_ <- tails tags]
putStr $ unlines links
where
shortUrl x | "http://" `isPrefixOf` x = shortUrl $ drop 7 x
| "www." `isPrefixOf` x = shortUrl $ drop 4 x
| otherwise = takeWhile (/= '/') x
ascii ('\226':'\128':'\147':xs) = '-' : ascii xs
ascii ('\194':'\163':xs) = "#GBP " ++ ascii xs
ascii (x:xs) = x : ascii xs
ascii [] = []
spjPapers :: IO ()
spjPapers = do
tags <- fmap parseTags $ openItem "http://research.microsoft.com/en-us/people/simonpj/"
let links = map f $ sections (~== "<A>") $
takeWhile (~/= "<A name=haskell>") $
drop 5 $ dropWhile (~/= "<A name=current>") tags
putStr $ unlines links
where
f :: [Tag String] -> String
f = dequote . unwords . words . fromTagText . head . filter isTagText
dequote ('\"':xs) | last xs == '\"' = init xs
dequote x = x
ndmPapers :: IO ()
ndmPapers = do
tags <- fmap parseTags $ openItem "http://community.haskell.org/~ndm/downloads/"
let papers = map f $ sections (~== "<li class=paper>") tags
putStr $ unlines papers
where
f :: [Tag String] -> String
f xs = fromTagText (xs !! 2)
currentTime :: IO ()
currentTime = do
tags <- fmap parseTags $ openItem "http://www.timeanddate.com/worldclock/city.html?n=136"
let res = fromTagText (dropWhile (~/= "<strong id=ct>") tags !! 1)
putStrLn res
type Section = String
data Package = Package {name :: String, desc :: String, href :: String}
deriving Show
hackage :: IO [(Section,[Package])]
hackage = do
tags <- fmap parseTags $ openItem "http://hackage.haskell.org/packages/archive/pkg-list.html"
return $ map parseSect $ partitions (~== "<h3>") tags
where
parseSect xs = (nam, packs)
where
nam = fromTagText $ xs !! 2
packs = map parsePackage $ partitions (~== "<li>") xs
parsePackage xs =
Package
(fromTagText $ xs !! 2)
(drop 2 $ dropWhile (/= ':') $ fromTagText $ xs !! 4)
(fromAttrib "href" $ xs !! 1)
-- rssCreators Example: prints names of story contributors on
-- sequence.complete.org. This content is RSS (not HTML), and the selected
-- tag uses a different XML namespace "dc:creator".
rssCreators :: IO ()
rssCreators = do
tags <- fmap parseTags $ openItem "http://sequence.complete.org/node/feed"
putStrLn $ unlines $ map names $ partitions (~== "<dc:creator>") tags
where names xs = fromTagText $ xs !! 1
validate :: String -> IO ()
validate x = putStr . unlines . g . f . parseTagsOptions opts =<< openItem x
where
opts = parseOptions{optTagPosition=True, optTagWarning=True}
f :: [Tag String] -> [String]
f (TagPosition row col:TagWarning warn:rest) =
("Warning (" ++ show row ++ "," ++ show col ++ "): " ++ warn) : f rest
f (TagWarning warn:rest) =
("Warning (?,?): " ++ warn) : f rest
f (_:rest) = f rest
f [] = []
g xs = xs ++ [if n == 0 then "Success, no warnings"
else "Failed, " ++ show n ++ " warning" ++ ['s'|n>1]]
where n = length xs
|