File: storage_options.py

package info (click to toggle)
universal-pathlib 0.3.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,656 kB
  • sloc: python: 20,552; makefile: 5
file content (440 lines) | stat: -rw-r--r-- 14,471 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
"""Storage options types for various filesystems based on fsspec."""

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TypedDict

if TYPE_CHECKING:
    from asyncio import AbstractEventLoop
    from typing import Any
    from typing import Literal

    from fsspec.implementations.cache_mapper import AbstractCacheMapper
    from fsspec.spec import AbstractFileSystem


__all__ = [
    "SimpleCacheStorageOptions",
    "GCSStorageOptions",
    "S3StorageOptions",
    "AzureStorageOptions",
    "HfStorageOptions",
    "DataStorageOptions",
    "FTPStorageOptions",
    "GitHubStorageOptions",
    "HDFSStorageOptions",
    "HTTPStorageOptions",
    "FileStorageOptions",
    "MemoryStorageOptions",
    "SFTPStorageOptions",
    "SMBStorageOptions",
    "WebdavStorageOptions",
    "ZipStorageOptions",
    "TarStorageOptions",
]


class _AbstractStorageOptions(TypedDict, total=False):
    """Base storage options for fsspec-based filesystems"""

    # dircache related options
    use_listings_cache: bool
    listings_expiry_time: int | float | None
    max_paths: int | None
    # fs instance cache options
    skip_instance_cache: bool
    # async fs instance options
    asynchronous: bool
    loop: AbstractEventLoop | None
    batch_size: int | None


class _ChainableStorageOptions(TypedDict, total=False):
    """Storage options for filesystems supporting chaining"""

    target_protocol: str | None
    target_options: dict[str, Any] | None
    fs: AbstractFileSystem | None


class SimpleCacheStorageOptions(
    _AbstractStorageOptions,
    _ChainableStorageOptions,
    total=False,
):
    """Storage options for SimpleCache"""

    cache_storage: Literal["TMP"] | list[str] | str
    cache_check: int | Literal[False]
    check_files: bool
    expiry_time: int | Literal[False]
    same_names: bool | None
    compression: str  # todo: specify allowed values
    cache_mapper: AbstractCacheMapper | None


class GCSStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for Google Cloud Storage"""

    # Authentication and project settings
    project: str
    access: Literal["read_only", "read_write", "full_control"]
    token: (
        None
        | Literal["google_default", "cache", "anon", "browser", "cloud"]
        | str
        | dict[str, Any]
    )

    # Performance and caching
    block_size: int | None
    consistency: Literal["none", "size", "md5"]
    cache_timeout: float | None  # overrides listings_expiry_time if set

    # Request configuration
    requests_timeout: float | None
    requester_pays: bool | str
    session_kwargs: dict[str, Any] | None  # aiohttp.ClientSession kwargs
    timeout: float | None  # timeout used for .buckets?

    # Connection settings
    endpoint_url: str | None
    check_connection: bool | None

    # Storage configuration
    default_location: str | None
    version_aware: bool

    # Deprecated options
    # secure_serialize: bool


class S3StorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for AWS S3 and S3-compatible services"""

    # Authentication
    anon: bool
    key: str | None
    secret: str | None
    token: str | None
    username: str | None  # alias for key
    password: str | None  # alias for secret

    # Connection settings
    endpoint_url: str | None
    use_ssl: bool

    # AWS-specific configuration
    client_kwargs: dict[str, Any] | None  # botocore Client kwargs
    config_kwargs: dict[str, Any] | None  # botocore Config kwargs
    s3_additional_kwargs: dict[str, Any] | None  # s3 api methods kwargs
    session: Any | None  # aiobotocore AioSession

    # Performance settings
    requester_pays: bool
    default_block_size: int | None
    default_fill_cache: bool
    default_cache_type: str  # fsspec.caching Literal["readahead", "none", "bytes", ...]
    max_concurrency: int
    fixed_upload_size: bool

    # Feature flags
    version_aware: bool
    cache_regions: bool


class AzureStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for Azure Blob Storage and Azure Data Lake Gen2"""

    # Account and authentication
    account_name: str | None
    account_key: str | None
    connection_string: str | None
    credential: (
        str | Any | None
    )  # azure.core.credentials_async.AsyncTokenCredential or SAS token
    sas_token: str | None
    anon: bool | None

    # Service Principal authentication
    client_id: str | None
    client_secret: str | None
    tenant_id: str | None

    # Connection and networking
    # request_session: Any | None  # for http requests not used by anything ???
    # socket_timeout: int | None  deprecated
    account_host: str | None
    location_mode: Literal["primary", "secondary"]

    # Performance settings
    blocksize: int  # block size for download/upload operations
    default_fill_cache: bool
    default_cache_type: str  # fsspec cache type
    max_concurrency: int | None

    # Timeout settings
    timeout: int | None  # server-side timeout for operations
    connection_timeout: int | None  # connection establishment timeout
    read_timeout: int | None  # read operation timeout

    # Feature flags
    version_aware: bool  # support blob versioning
    assume_container_exists: bool | None  # container existence assumptions


class HfStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for Hugging face filesystem"""

    # Authentication
    token: str | None

    # Connection settings
    endpoint: str | None

    # Performance settings
    block_size: (
        int | None
    )  # Block size for reading bytes; 0 = raw requests file-like objects


class DataStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for Data URIs filesystem"""

    # No specific options for Data URIs at the moment


class FTPStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for FTP filesystem"""

    # Connection settings
    host: str  # The remote server name/ip to connect to (required)
    port: int  # Port to connect with (default: 21)

    # Authentication
    username: (
        str | None
    )  # User's identifier for authentication (anonymous if not given)
    password: str | None  # User's password on the server
    acct: str | None  # Account string for authentication (some servers require this)

    # Performance settings
    block_size: int | None  # Read-ahead or write buffer size

    # FTP-specific settings
    tempdir: (
        str | None
    )  # Directory on remote to put temporary files when in a transaction
    timeout: int  # Timeout of the FTP connection in seconds (default: 30)
    encoding: str  # Encoding for dir and filenames in FTP connection (default: "utf-8")

    # Security settings
    tls: bool  # Use FTP-TLS (default: False)


class GitHubStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for GitHub repository filesystem"""

    # Repository identification
    org: str  # GitHub organization or username
    repo: str  # Repository name
    sha: str | None  # Commit SHA, branch, or tag (default: current master)

    # Authentication
    username: str | None  # GitHub username for authenticated access
    token: str | None  # GitHub personal access token

    # Connection settings
    timeout: tuple[int, int] | int | None  # (connect, read) timeouts or single timeout


class HDFSStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for Hadoop Distributed File System (HDFS)"""

    # Connection settings
    host: str  # Hostname, IP or "default" to try to read from Hadoop config
    port: int  # Port to connect on, or default from Hadoop config if 0

    # Authentication
    user: str | None  # Username to connect as
    kerb_ticket: str | None  # Kerberos ticket for authentication

    # HDFS-specific settings
    replication: int  # Replication factor for write operations (default: 3)
    extra_conf: (
        dict[str, Any] | None
    )  # Additional configuration passed to HadoopFileSystem


class HTTPStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for HTTP(S) filesystem"""

    # Performance settings
    block_size: (
        int | None
    )  # Block size for reading bytes; 0 = raw requests file-like objects

    # Link parsing behavior
    simple_links: (
        bool  # Consider both HTML <a> tags and URL-like strings vs HTML tags only
    )
    same_scheme: (
        bool  # Only consider paths with matching http/https scheme during ls/glob
    )

    # Caching configuration
    cache_type: str  # Default cache type used in open() (e.g., "bytes")
    cache_options: dict[str, Any] | None  # Default cache options used in open()

    # HTTP client configuration
    client_kwargs: dict[str, Any] | None  # Passed to aiohttp.ClientSession
    get_client: Any | None  # Callable that constructs aiohttp.ClientSession

    # Encoding settings
    encoded: bool  # Whether URLs should be encoded

    # Deprecated options
    # size_policy: Any  # Deprecated parameter


class FileStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for local filesystem (file:// and local:// protocols)"""

    # File system behavior
    auto_mkdir: bool  # Whether to create parent directories when opening files


class MemoryStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for memory filesystem"""

    # No specific options for memory filesystem at the moment


class SFTPStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for SFTP/SSH filesystem"""

    # Connection settings
    host: str  # Hostname or IP address (required)
    port: int | None  # SSH port (default: 22)

    # Authentication
    username: str | None  # Username to authenticate as
    password: (
        str | None
    )  # Password authentication; also used for private key decryption
    passphrase: str | None  # Used for decrypting private keys
    pkey: Any | None  # Private key for authentication (paramiko.PKey)
    key_filename: str | list[str] | None  # Filename(s) of private key(s)/certs to try

    # Connection behavior
    timeout: float | None  # TCP connect timeout in seconds
    allow_agent: bool | None  # Whether to connect to SSH agent (default: True)
    look_for_keys: (
        bool | None
    )  # Whether to search for private keys in ~/.ssh/ (default: True)
    compress: bool | None  # Whether to turn on compression
    sock: Any | None  # Socket or socket-like object for communication

    # GSS-API authentication
    gss_auth: bool | None  # Use GSS-API authentication
    gss_kex: bool | None  # Perform GSS-API Key Exchange and user authentication
    gss_deleg_creds: bool | None  # Delegate GSS-API client credentials
    gss_host: str | None  # Target name in kerberos database (default: hostname)
    gss_trust_dns: bool | None  # Trust DNS to canonicalize hostname (default: True)

    # Timeout settings
    banner_timeout: float | None  # SSH banner timeout in seconds
    auth_timeout: float | None  # Authentication response timeout in seconds
    channel_timeout: float | None  # Channel open response timeout in seconds

    # Advanced configuration
    disabled_algorithms: (
        dict[str, Any] | None
    )  # Algorithms to disable (passed to Transport)
    transport_factory: Any | None  # Callable to generate Transport instance
    auth_strategy: Any | None  # AuthStrategy instance for newer authentication

    # SFTP-specific settings
    temppath: str  # Remote temporary directory path (default: "/tmp")


class SMBStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for SMB/Windows/Samba network shares"""

    # Connection settings
    host: str  # The remote server name/ip to connect to (required)
    port: int | None  # Port to connect with (usually 445, sometimes 139)

    # Authentication
    username: str | None  # Username to connect with (required if not using Kerberos)
    password: str | None  # User's password on the server

    # Connection behavior
    timeout: int  # Connection timeout in seconds (default: 60)
    encrypt: bool | None  # Whether to force encryption

    # File access control
    share_access: Literal["r", "w", "d"] | None  # Default access for file operations
    # None (default): exclusively locks file until closed
    # 'r': Allow other handles with read access
    # 'w': Allow other handles with write access
    # 'd': Allow other handles with delete access

    # Session retry configuration
    register_session_retries: int  # Number of session registration retries (default: 4)
    register_session_retry_wait: (
        int  # Wait time between retries in seconds (default: 1)
    )
    register_session_retry_factor: int  # Exponential backoff factor (default: 10)

    # File system behavior
    auto_mkdir: bool  # Whether to create parent directories when opening files


class WebdavStorageOptions(_AbstractStorageOptions, total=False):
    """Storage options for WebDAV filesystem"""

    # Connection settings
    base_url: str  # Base URL of the WebDAV server (required)

    # Authentication
    auth: (
        tuple[str, str] | Any | None
    )  # Authentication (username, password) tuple or custom auth

    # Client configuration
    client: Any | None  # webdav4.client.Client instance


class ZipStorageOptions(
    _AbstractStorageOptions,
    _ChainableStorageOptions,
    total=False,
):
    """Storage options for ZIP archive filesystem"""

    # Archive file settings
    fo: str | Any  # Path to ZIP file or file-like object
    mode: Literal["r", "w", "a"]  # Open mode: read, write, or append

    # ZIP compression settings
    compression: int  # Compression method (e.g., zipfile.ZIP_STORED, ZIP_DEFLATED)
    allowZip64: bool  # Enable ZIP64 extensions for large files
    compresslevel: int | None  # Compression level (None uses default for method)


class TarStorageOptions(
    _AbstractStorageOptions,
    _ChainableStorageOptions,
    total=False,
):
    """Storage options for TAR archive filesystem (read-only)"""

    # Archive file settings
    fo: str | Any  # Path to TAR file or file-like object
    # Compression settings
    compression: (
        str | None
    )  # Compression method: 'gzip', 'bz2', 'xz', or None for auto-detect
    index_store: str | None  # Path to store/load the file index cache