File: Zip.hs

package info (click to toggle)
haskell-zip-stream 0.2.2.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 104 kB
  • sloc: haskell: 709; makefile: 7
file content (276 lines) | stat: -rw-r--r-- 11,097 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
-- |Stream the creation of a zip file, e.g., as it's being uploaded.
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE BangPatterns #-}
module Codec.Archive.Zip.Conduit.Zip
  ( zipStream
  , ZipOptions(..)
  , ZipInfo(..)
  , defaultZipOptions
  , ZipEntry(..)
  , ZipData(..)
  , zipFileData
  ) where

import qualified Codec.Compression.Zlib.Raw as Z
import           Control.Arrow ((&&&), (+++), left)
import           Control.DeepSeq (force)
import           Control.Monad (when)
import           Control.Monad.Catch (MonadThrow)
import           Control.Monad.Primitive (PrimMonad)
import           Control.Monad.State.Strict (StateT, get)
import           Control.Monad.Trans.Resource (MonadResource)
import qualified Data.Binary.Put as P
import           Data.Bits (bit, shiftL, shiftR, (.|.))
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BSC
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Conduit as C
import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.Combinators as CC
import           Data.Conduit.Lift (stateC, execStateC)
import           Data.Conduit.Serialization.Binary (sourcePut)
import qualified Data.Conduit.Zlib as CZ
import           Data.Digest.CRC32 (crc32)
import           Data.Either (isLeft)
import           Data.Maybe (fromMaybe, fromJust)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import           Data.Time (LocalTime(..), TimeOfDay(..), toGregorian)
import           Data.Word (Word16, Word32, Word64)

import           Codec.Archive.Zip.Conduit.Types
import           Codec.Archive.Zip.Conduit.Internal

-- |Options controlling zip file parameters and features
data ZipOptions = ZipOptions
  { zipOpt64 :: !Bool -- ^Allow 'ZipDataSource's over 4GB (reduces compatibility in some cases); this is automatically enabled for any files of known size (e.g., 'zipEntrySize')
  , zipOptCompressLevel :: !Int -- ^Compress zipped files (0 = store only, 1 = minimal, 9 = best; non-zero improves compatibility, since some unzip programs don't supported stored, streamed files, including the one in this package)
  , zipOptInfo :: !ZipInfo -- ^Other parameters to store in the zip file
  }

defaultZipOptions :: ZipOptions
defaultZipOptions = ZipOptions
  { zipOpt64 = False
  , zipOptCompressLevel = -1
  , zipOptInfo = ZipInfo
    { zipComment = BS.empty
    }
  }

infixr 7 ?*
(?*) :: Num a => Bool -> a -> a
True ?* x = x
False ?* _ = 0

-- |Use a file on disk as 'ZipData' (@'ZipDataSource' . 'CC.sourceFile'@).
zipFileData :: MonadResource m => FilePath -> ZipData m
zipFileData = ZipDataSource . CC.sourceFile

zipData :: Monad m => ZipData m -> Either (C.ConduitM () BS.ByteString m ()) BSL.ByteString
zipData (ZipDataByteString b) = Right b
zipData (ZipDataSource s) = Left s

dataSize :: Either a BSL.ByteString -> Maybe Word64
dataSize (Left _) = Nothing
dataSize (Right b) = Just $ fromIntegral $ BSL.length b

toDOSTime :: LocalTime -> (Word16, Word16)
toDOSTime (LocalTime (toGregorian -> (year, month, day)) (TimeOfDay hour mins secs)) =
  ( fromIntegral hour `shiftL` 11 .|. fromIntegral mins `shiftL` 5 .|. truncate secs `shiftR` 1
  , fromIntegral (year - 1980) `shiftL` 9 .|. fromIntegral month `shiftL` 5 .|. fromIntegral day
  )

countOutput :: Monad m => C.ConduitM i BS.ByteString m () -> C.ConduitM i BS.ByteString (StateT Word64 m) ()
countOutput c = stateC $ \s -> (,) () . (s +) <$> outputSize c

output :: MonadThrow m => P.Put -> C.ConduitM i BS.ByteString (StateT Word64 m) ()
output = countOutput . sourcePut

maxBound16 :: Integral n => n
maxBound16 = fromIntegral (maxBound :: Word16)

data CommonFileHeaderInfo = CommonFileHeaderInfo
  { cfhiIsStreamingEntry :: !Bool
  , cfhiHasUtf8Filename :: !Bool
  , cfhiIsCompressed :: !Bool
  , cfhiTime :: !Word16
  , cfhiDate :: !Word16
  } deriving (Eq, Ord, Show)

putCommonFileHeaderPart :: CommonFileHeaderInfo -> P.PutM ()
putCommonFileHeaderPart CommonFileHeaderInfo{..} = do
  P.putWord16le $ cfhiIsStreamingEntry ?* bit 3 .|. cfhiHasUtf8Filename ?* bit 11
  P.putWord16le $ cfhiIsCompressed ?* 8
  P.putWord16le $ cfhiTime
  P.putWord16le $ cfhiDate

-- | This is retained in memory until the end of the archive is written.
--
-- To avoid space leaks, this should contain only strict data.
data CentralDirectoryInfo = CentralDirectoryInfo
  { cdiOff :: !Word64
  , cdiZ64 :: !Bool
  , cdiCommonFileHeaderInfo :: !CommonFileHeaderInfo
  , cdiCrc :: !Word32
  , cdiUsz :: !Word64
  , cdiName :: !BSC.ByteString
  , cdiCsz :: !Word64
  , cdiZipEntryExternalAttributes :: !(Maybe Word32) -- lazy Maybe must be e.g. via `force` at creation
  } deriving (Eq, Ord, Show)

putCentralDirectory :: CentralDirectoryInfo -> P.PutM ()
putCentralDirectory CentralDirectoryInfo{..} = do
  -- central directory
  let o64 = cdiOff >= maxBound32
      l64 = cdiZ64 ?* 16 + o64 ?* 8
      a64 = cdiZ64 || o64
  P.putWord32le 0x02014b50
  P.putWord8 zipVersion
  P.putWord8 osVersion
  P.putWord8 $ if a64 then 45 else 20
  P.putWord8 osVersion
  putCommonFileHeaderPart cdiCommonFileHeaderInfo
  P.putWord32le cdiCrc
  P.putWord32le $ if cdiZ64 then maxBound32 else fromIntegral cdiCsz
  P.putWord32le $ if cdiZ64 then maxBound32 else fromIntegral cdiUsz
  P.putWord16le $ fromIntegral (BS.length cdiName)
  P.putWord16le $ a64 ?* (4 + l64)
  P.putWord16le 0 -- comment length
  P.putWord16le 0 -- disk number
  P.putWord16le 0 -- internal file attributes
  P.putWord32le $ fromMaybe 0 cdiZipEntryExternalAttributes
  P.putWord32le $ if o64 then maxBound32 else fromIntegral cdiOff
  P.putByteString cdiName
  when a64 $ do
    P.putWord16le 0x0001
    P.putWord16le l64
    when cdiZ64 $ do
      P.putWord64le cdiUsz
      P.putWord64le cdiCsz
    when o64 $
      P.putWord64le cdiOff

-- |Stream produce a zip file, reading a sequence of entries with data.
-- Although file data is never kept in memory (beyond a single 'ZipDataByteString'), the format of zip files requires producing a final directory of entries at the end of the file, consuming an additional ~100 bytes of state per entry during streaming.
-- The final result is the total size of the zip file.
--
-- Depending on options, the resulting zip file should be compatible with most unzipping applications.
-- Any errors are thrown in the underlying monad (as 'ZipError's).
zipStream ::
  ( MonadThrow m
  , PrimMonad m
  ) => ZipOptions -> C.ConduitM (ZipEntry, ZipData m) BS.ByteString m Word64
zipStream ZipOptions{..} = execStateC 0 $ do
  (cnt, cdir) <- next 0 (return ())
  cdoff <- get
  output cdir
  eoff <- get
  endDirectory cdoff (eoff - cdoff) cnt
  where
  next cnt dir = C.await >>= maybe
    (return (cnt, dir))
    (\e -> do
      d <- entry e
      next (succ cnt) $ dir >> d)
  entry (ZipEntry{..}, zipData -> dat) = do
    let usiz = dataSize dat
        sdat = left (\x -> C.toProducer x C..| sizeCRC) dat
        cfhiIsCompressed = zipOptCompressLevel /= 0
               && all (0 /=) usiz
               && all (0 /=) zipEntrySize
        cfhiIsStreamingEntry = isLeft dat
        compressPlainBs =
          Z.compressWith
            Z.defaultCompressParams
              { Z.compressLevel =
                  if zipOptCompressLevel == -1
                    then Z.defaultCompression
                    else Z.compressionLevel zipOptCompressLevel
              }
        (cdat, csiz)
          | cfhiIsCompressed =
            ( ((`C.fuseBoth` (outputSize $ CZ.compress zipOptCompressLevel deflateWindowBits))
              +++ compressPlainBs) sdat
            , dataSize cdat)
          | otherwise = (left (fmap (id &&& fst)) sdat, usiz)
        cdiZ64 = maybe (zipOpt64 || any (maxBound32 <) zipEntrySize)
          (maxBound32 <) (max <$> usiz <*> csiz)
        cfhiHasUtf8Filename = isLeft zipEntryName
        cdiName = either TE.encodeUtf8 id zipEntryName
        namelen = BS.length cdiName
        (cfhiTime, cfhiDate) = toDOSTime zipEntryTime
        mcrc = either (const Nothing) (Just . crc32) dat
        !cdiCommonFileHeaderInfo = CommonFileHeaderInfo{..}
    when (namelen > maxBound16) $ zipError $ either T.unpack BSC.unpack zipEntryName ++ ": entry name too long"
    cdiOff <- get
    output $ do
      P.putWord32le 0x04034b50
      P.putWord8 $ if cdiZ64 then 45 else 20
      P.putWord8 osVersion
      putCommonFileHeaderPart cdiCommonFileHeaderInfo
      P.putWord32le $ fromMaybe 0 mcrc
      P.putWord32le $ if cdiZ64 then maxBound32 else maybe 0 fromIntegral csiz
      P.putWord32le $ if cdiZ64 then maxBound32 else maybe 0 fromIntegral usiz
      P.putWord16le $ fromIntegral namelen
      P.putWord16le $ cdiZ64 ?* 20
      P.putByteString cdiName
      when cdiZ64 $ do
        P.putWord16le 0x0001
        P.putWord16le 16
        P.putWord64le $ fromMaybe 0 usiz
        P.putWord64le $ fromMaybe 0 csiz
    let outsz c = stateC $ \(!o) -> (id &&& (o +) . snd) <$> c
    ((cdiUsz, cdiCrc), cdiCsz) <- either
      (\cd -> do
        r@((usz, crc), csz) <- outsz cd -- write compressed data
        when (not cdiZ64 && (usz > maxBound32 || csz > maxBound32)) $ zipError $ either T.unpack BSC.unpack zipEntryName ++ ": file too large and zipOpt64 disabled"
        output $ do
          P.putWord32le 0x08074b50
          P.putWord32le crc
          let putsz
                | cdiZ64 = P.putWord64le
                | otherwise = P.putWord32le . fromIntegral
          putsz csz
          putsz usz
        return r)
      (\b -> outsz $ ((fromJust usiz, fromJust mcrc), fromJust csiz) <$ CB.sourceLbs b)
      cdat
    when (any (cdiUsz /=) zipEntrySize) $ zipError $ either T.unpack BSC.unpack zipEntryName ++ ": incorrect zipEntrySize"
    let !centralDirectoryInfo = CentralDirectoryInfo
          { cdiZipEntryExternalAttributes = force zipEntryExternalAttributes
          , .. }
    return $ putCentralDirectory centralDirectoryInfo

  endDirectory cdoff cdlen cnt = do
    let z64 = zipOpt64 || cdoff > maxBound32 || cnt > maxBound16
    when z64 $ output $ do
      P.putWord32le 0x06064b50 -- zip64 end
      P.putWord64le 44 -- length of this record
      P.putWord8 zipVersion
      P.putWord8 osVersion
      P.putWord8 45
      P.putWord8 osVersion
      P.putWord32le 0 -- disk
      P.putWord32le 0 -- central disk
      P.putWord64le cnt
      P.putWord64le cnt
      P.putWord64le cdlen
      P.putWord64le cdoff
      P.putWord32le 0x07064b50 -- locator:
      P.putWord32le 0 -- central disk
      P.putWord64le $ cdoff + cdlen
      P.putWord32le 1 -- total disks
    let comment = zipComment zipOptInfo
        commlen = BS.length comment
    when (commlen > maxBound16) $ zipError "comment too long"
    output $ do
      P.putWord32le 0x06054b50 -- end
      P.putWord16le 0 -- disk
      P.putWord16le 0 -- central disk
      P.putWord16le $ fromIntegral $ min maxBound16 cnt
      P.putWord16le $ fromIntegral $ min maxBound16 cnt
      P.putWord32le $ fromIntegral $ min maxBound32 cdlen
      P.putWord32le $ fromIntegral $ min maxBound32 cdoff
      P.putWord16le $ fromIntegral commlen
      P.putByteString comment