File: UnZip.hs

package info (click to toggle)
haskell-zip-stream 0.2.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 104 kB
  • sloc: haskell: 709; makefile: 7
file content (239 lines) | stat: -rw-r--r-- 9,521 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
-- |Stream the extraction of a zip file, e.g., as it's being downloaded.
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE RankNTypes #-}
module Codec.Archive.Zip.Conduit.UnZip
  ( unZipStream
  , ZipEntry(..)
  , ZipInfo(..)
  ) where

import           Control.Applicative ((<|>), empty)
import           Control.Monad (when, unless, guard)
import           Control.Monad.Catch (MonadThrow)
import           Control.Monad.Primitive (PrimMonad)
import qualified Data.Binary.Get as G
import           Data.Bits ((.&.), testBit, clearBit, shiftL, shiftR)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BSC
import qualified Data.Conduit as C
import qualified Data.Conduit.Combinators as CC
import           Data.Conduit.Serialization.Binary (sinkGet)
import qualified Data.Conduit.Zlib as CZ
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import           Data.Time (LocalTime(..), TimeOfDay(..), fromGregorian)
import           Data.Word (Word16, Word32, Word64)

import           Codec.Archive.Zip.Conduit.Types
import           Codec.Archive.Zip.Conduit.Internal

data Header m
  = FileHeader
    { fileDecompress :: C.ConduitM BS.ByteString BS.ByteString m ()
    , fileEntry :: !ZipEntry
    , fileCRC :: !Word32
    , fileCSize :: !Word64
    , fileZip64 :: !Bool
    }
  | EndOfCentralDirectory
    { endInfo :: ZipInfo
    }

data ExtField = ExtField
  { extZip64 :: Bool
  , extZip64USize
  , extZip64CSize :: Word64
  }

{- ExtUnix
  { extUnixATime
  , extUnixMTime :: UTCTime
  , extUnixUID
  , extUnixGID :: Word16
  , extUnixData :: BS.ByteString
  }
-}

pass :: (MonadThrow m, Integral n) => n -> C.ConduitM BS.ByteString BS.ByteString m ()
pass 0 = return ()
pass n = C.await >>= maybe
  (zipError $ "EOF in file data, expecting " ++ show ni ++ " more bytes")
  (\b ->
    let n' = ni - toInteger (BS.length b) in
    if n' < 0
      then do
        let (b', r) = BS.splitAt (fromIntegral n) b
        C.yield b'
        C.leftover r
      else do
        C.yield b
        pass n')
  where ni = toInteger n

foldGet :: (a -> G.Get a) -> a -> G.Get a
foldGet g z = do
  e <- G.isEmpty
  if e then return z else g z >>= foldGet g

fromDOSTime :: Word16 -> Word16 -> LocalTime
fromDOSTime time date = LocalTime
  (fromGregorian
    (fromIntegral $ date `shiftR` 9 + 1980)
    (fromIntegral $ date `shiftR` 5 .&. 0x0f)
    (fromIntegral $ date            .&. 0x1f))
  (TimeOfDay
    (fromIntegral $ time `shiftR` 11)
    (fromIntegral $ time `shiftR` 5 .&. 0x3f)
    (fromIntegral $ time `shiftL` 1 .&. 0x3f))

-- |Stream process a zip file, producing a sequence of entry headers and data blocks.
-- For example, this might produce: @Left (ZipEntry "directory\/" ...), Left (ZipEntry "directory\/file.txt" ...), Right "hello w", Right "orld!\\n", Left ...@
-- The final result is summary information taken from the end of the zip file.
-- No state is maintained during processing, and, in particular, any information in the central directory is discarded.
--
-- This only supports a limited number of zip file features, including deflate compression and zip64.
-- It does not (ironically) support uncompressed zip files that have been created as streams, where file sizes are not known beforehand.
-- Since it does not use the offset information at the end of the file, it assumes all entries are packed sequentially, which is usually the case.
-- Any errors are thrown in the underlying monad (as 'ZipError's or 'Data.Conduit.Serialization.Binary.ParseError').
unZipStream ::
  ( MonadThrow m
  , PrimMonad m
  ) => C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) m ZipInfo
unZipStream = next where
  next = do -- local header, or start central directory
    h <- sinkGet $ do
      sig <- G.getWord32le
      case sig of
        0x04034b50 -> fileHeader
        _ -> centralBody sig
    case h of
      FileHeader{..} -> do
        C.yield $ Left fileEntry
        r <- C.mapOutput Right $
          case zipEntrySize fileEntry of
            Nothing -> do -- unknown size
              (csize, (size, crc)) <- inputSize fileDecompress `C.fuseBoth` sizeCRC
              -- traceM $ "csize=" ++ show csize ++ " size=" ++ show size ++ " crc=" ++ show crc
              -- required data description
              sinkGet $ dataDesc h
                { fileCSize = csize
                , fileCRC = crc
                , fileEntry = fileEntry
                  { zipEntrySize = Just size
                  }
                }
            Just usize -> do -- known size
              (size, crc) <- pass fileCSize
                C..| (fileDecompress >> CC.sinkNull)
                C..| sizeCRC
              -- traceM $ "size=" ++ show size ++ "," ++ show (zipEntrySize fileEntry) ++ " crc=" ++ show crc ++ "," ++ show fileCRC
              -- optional data description (possibly ambiguous!)
              sinkGet $ (guard =<< dataDesc h) <|> return ()
              return (size == usize && crc == fileCRC)
        unless r $ zipError $ either T.unpack BSC.unpack (zipEntryName fileEntry) ++ ": data integrity check failed"
        next
      EndOfCentralDirectory{..} -> do
        return endInfo
  dataDesc h = -- this takes a bit of flexibility to account for the various cases
    (do -- with signature
      sig <- G.getWord32le
      guard (sig == 0x08074b50)
      dataDescBody h)
    <|> dataDescBody h -- without signature
  dataDescBody FileHeader{..} = do
    crc <- G.getWord32le
    let getSize = if fileZip64 then G.getWord64le else fromIntegral <$> G.getWord32le
    csiz <- getSize
    usiz <- getSize
    -- traceM $ "crc=" ++ show crc ++ "," ++ show fileCRC ++ " csiz=" ++ show csiz ++ "," ++ show fileCSize ++ " usiz=" ++ show usiz ++ "," ++ show (zipEntrySize fileEntry)
    return $ crc == fileCRC && csiz == fileCSize && (usiz ==) `all` zipEntrySize fileEntry
  dataDescBody _ = empty
  central = G.getWord32le >>= centralBody
  centralBody 0x02014b50 = centralHeader >> central
  centralBody 0x06064b50 = zip64EndDirectory >> central
  centralBody 0x07064b50 = G.skip 16 >> central
  centralBody 0x06054b50 = EndOfCentralDirectory <$> endDirectory
  centralBody sig = fail $ "Unknown header signature: " ++ show sig
  fileHeader = do
    ver <- G.getWord8
    _os <- G.getWord8 -- OS Version (could require 0 = DOS, but we ignore ext attrs altogether)
    when (ver > zipVersion) $ fail $ "Unsupported version: " ++ show ver
    gpf <- G.getWord16le
    -- when (gpf .&. complement (bit 1 .|. bit 2 .|. bit 3) /= 0) $ fail $ "Unsupported flags: " ++ show gpf
    when (gpf `clearBit` 1 `clearBit` 2 `clearBit` 3 `clearBit` 11 /= 0) $ fail $ "Unsupported flags: " ++ show gpf
    comp <- G.getWord16le
    dcomp <- case comp of
      0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data"
        | otherwise -> return idConduit
      8 -> return $ CZ.decompress deflateWindowBits
      _ -> fail $ "Unsupported compression method: " ++ show comp
    time <- fromDOSTime <$> G.getWord16le <*> G.getWord16le
    crc <- G.getWord32le
    csiz <- G.getWord32le
    usiz <- G.getWord32le
    nlen <- fromIntegral <$> G.getWord16le
    elen <- fromIntegral <$> G.getWord16le
    name <- G.getByteString nlen
    let getExt ext = do
          t <- G.getWord16le
          z <- fromIntegral <$> G.getWord16le
          G.isolate z $ case t of
            0x0001 -> do
              -- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF"
              usiz' <- if usiz == maxBound32 then G.getWord64le else return $ extZip64USize ext
              csiz' <- if csiz == maxBound32 then G.getWord64le else return $ extZip64CSize ext
              return ext
                { extZip64 = True
                , extZip64USize = usiz'
                , extZip64CSize = csiz'
                }
            {-
            0x000d -> do
              atim <- G.getWord32le
              mtim <- G.getWord32le
              uid <- G.getWord16le
              gid <- G.getWord16le
              dat <- G.getByteString $ z - 12
              return ExtUnix
                { extUnixATime = posixSecondsToUTCTime atim
                , extUnixMTime = posixSecondsToUTCTime mtim
                , extUnixUID = uid
                , extUnixGID = gid
                , extUnixData = dat
                }
            -}
            _ -> ext <$ G.skip z
    ExtField{..} <- G.isolate elen $ foldGet getExt ExtField
      { extZip64 = False
      , extZip64USize = fromIntegral usiz
      , extZip64CSize = fromIntegral csiz
      }
    return FileHeader
      { fileEntry = ZipEntry
        { zipEntryName = if testBit gpf 11 then Left (TE.decodeUtf8 name) else Right name
        , zipEntryTime = time
        , zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize
        , zipEntryExternalAttributes = Nothing
        }
      , fileDecompress = dcomp
      , fileCSize = extZip64CSize
      , fileCRC = crc
      , fileZip64 = extZip64
      }
  centralHeader = do
    -- ignore everything
    G.skip 24
    nlen <- fromIntegral <$> G.getWord16le
    elen <- fromIntegral <$> G.getWord16le
    clen <- fromIntegral <$> G.getWord16le
    G.skip $ 12 + nlen + elen + clen
  zip64EndDirectory = do
    len <- G.getWord64le
    G.skip $ fromIntegral len -- would not expect to overflow...
  endDirectory = do
    G.skip 16
    clen <- fromIntegral <$> G.getWord16le
    comm <- G.getByteString clen
    return ZipInfo
      { zipComment = comm
      }