Source code for deephaven.experimental.s3

#
# Copyright (c) 2016-2025 Deephaven Data Labs and Patent Pending
#
from typing import Optional, Union
from warnings import warn

import jpy

from deephaven import DHError
from deephaven._wrapper import JObjectWrapper
from deephaven.time import DurationLike, to_j_duration

# If we move S3 to a permanent module, we should remove this try/except block and just import the types directly.
try:
    _JCredentials = jpy.get_type("io.deephaven.extensions.s3.Credentials")
    _JS3Instructions = jpy.get_type("io.deephaven.extensions.s3.S3Instructions")
except Exception:
    _JCredentials = None
    _JS3Instructions = None

"""
    This module is useful for reading from and writing to S3-compatible APIs.
    Importing this module requires the S3 specific deephaven extensions (artifact name deephaven-extensions-s3) to be
    included in the package. This is an opt-out functionality included by default. If not included, importing this
    module will fail to find the java types.
"""


[docs]class Credentials(JObjectWrapper):
    """
    Credentials object for authenticating with an S3 server.
    """
    j_object_type = _JCredentials

    def __init__(self, _j_object: jpy.JType):
        """
        Initializes the credentials object.

        Args:
            _j_object (Credentials): the Java credentials object.
        """
        self._j_object = _j_object

    @property
    def j_object(self) -> jpy.JType:
        return self._j_object

[docs]    @classmethod
    def resolving(cls) -> 'Credentials':
        """
        Default credentials provider used by Deephaven which resolves credentials in the following order:

        1. If a profile name, config file path, or credentials file path is provided via S3 Instructions, use the
        profile_name for loading the credentials from the config and credentials file and fail if none is found.

        2. Otherwise, use the default AWS SDK behavior that looks for credentials in this order: Java System Properties
        (`aws.accessKeyId` and `aws.secretAccessKey`), Environment Variables (`AWS_ACCESS_KEY_ID` and
        `AWS_SECRET_ACCESS_KEY`), Credential profiles file at the default location (~/.aws/credentials), or Instance
        profile credentials delivered through the Amazon EC2  metadata service. If still none found, fall back to
        anonymous credentials, which can only be used to read data with S3 policy set to allow anonymous access.

        Returns:
            Credentials: the credentials object.
        """
        return cls(_JCredentials.resolving())

[docs]    @classmethod
    def default(cls) -> 'Credentials':
        """
        Default credentials provider used by the AWS SDK that looks for credentials in this order:
        Java System Properties (`aws.accessKeyId` and `aws.secretAccessKey`), Environment Variables (`AWS_ACCESS_KEY_ID`
        and `AWS_SECRET_ACCESS_KEY`), Credential profiles file at the default location (~/.aws/credentials), and
        Instance profile credentials delivered through the Amazon EC2 metadata service.

        Returns:
            Credentials: the credentials object.
        """
        return cls(_JCredentials.defaultCredentials())

[docs]    @classmethod
    def basic(cls, access_key_id: str, secret_access_key: str) -> 'Credentials':
        """
        Basic credentials provider with the specified access key id and secret access key.

        Args:
            access_key_id (str): the access key id, used to identify the user.
            secret_access_key (str): the secret access key, used to authenticate the user.

        Returns:
            Credentials: the credentials object.
        """
        return cls(_JCredentials.basic(access_key_id, secret_access_key))

[docs]    @classmethod
    def anonymous(cls) -> 'Credentials':
        """
       Anonymous credentials provider, which can only be used to read data with S3 policy set to allow anonymous access.

        Returns:
            Credentials: the credentials object.
        """
        return cls(_JCredentials.anonymous())

[docs]    @classmethod
    def profile(cls) -> 'Credentials':
        """
        Use the profile name, config file path, or credentials file path from S3 Instructions for loading the
        credentials and fail if none found.

        Returns:
            Credentials: the credentials object.
        """
        return cls(_JCredentials.profile())


[docs]class S3Instructions(JObjectWrapper):
    """
    S3Instructions provides specialized instructions for reading from and writing to S3-compatible APIs.
    """

    j_object_type = _JS3Instructions or type(None)

    def __init__(self,
                 region_name: Optional[str] = None,
                 credentials: Optional[Credentials] = None,
                 max_concurrent_requests: Optional[int] = None,
                 read_ahead_count: Optional[int] = None,
                 fragment_size: Optional[int] = None,
                 connection_timeout: Optional[DurationLike] = None,
                 read_timeout: Optional[DurationLike] = None,
                 access_key_id: Optional[str] = None,
                 secret_access_key: Optional[str] = None,
                 anonymous_access: bool = False,
                 endpoint_override: Optional[str] = None,
                 write_part_size: Optional[int] = None,
                 num_concurrent_write_parts: Optional[int] = None,
                 profile_name: Optional[str] = None,
                 config_file_path: Optional[str] = None,
                 credentials_file_path: Optional[str] = None):

        """
        Initializes the instructions.

        Args:
            region_name (str): the region name for reading parquet files. If not provided, the default region will be
                picked by the AWS SDK from 'aws.region' system property, "AWS_REGION" environment variable, the
                {user.home}/.aws/credentials or {user.home}/.aws/config files, or from EC2 metadata service, if running
                in EC2. If no region name is derived from the above chain or the derived region name is incorrect for
                the bucket accessed, the correct region name will be derived internally, at the cost of one additional
                request.
            credentials (Credentials): the credentials object for authenticating to the S3 server, defaults to
                Credentials.resolving().
            max_concurrent_requests (int): the maximum number of concurrent requests for reading files, default is 256.
            read_ahead_count (int): the number of fragments to send asynchronous read requests for while reading the
                current fragment. Defaults to 32, which means fetch the next 32 fragments in advance when reading the
                current fragment.
            fragment_size (int): the maximum size of each fragment to read in bytes, defaults to 65536. If
                there are fewer bytes remaining in the file, the fetched fragment can be smaller.
            connection_timeout (DurationLike): the amount of time to wait when initially establishing a connection
                before giving up and timing out. Can be expressed as an integer in nanoseconds, a time interval string,
                e.g. "PT00:00:00.001" or "PT1s", or other time duration types. Default to 2 seconds.
            read_timeout (DurationLike): the amount of time to wait when reading a fragment before giving up and timing
                out. Can be expressed as an integer in nanoseconds, a time interval string, e.g. "PT00:00:00.001" or
                "PT1s", or other time duration types. Default to 2 seconds.
            access_key_id (str):  (Deprecated) the access key for reading files. Both access key and secret access key
                must be provided to use static credentials. If you specify both access key and secret key, then you
                cannot provide other credentials like setting anonymous_access or credentials argument.
                This option is deprecated and should be replaced by setting credentials as
                Credentials.basic(access_key_id, secret_access_key).
            secret_access_key (str): (Deprecated) the secret access key for reading files. Both access key and secret
                key must be provided to use static credentials.  If you specify both access key and secret key, then you
                cannot provide other credentials like setting anonymous_access or credentials argument.
                This option is deprecated and should be replaced by setting credentials as
                Credentials.basic(access_key_id, secret_access_key).
            anonymous_access (bool): (Deprecated) use anonymous credentials, this is useful when the S3 policy has been
                set to allow anonymous access. By default, is False. If you set this to True, you cannot provide other
                credentials like setting access_key_id or credentials argument.
                This option is deprecated and should be replaced by setting credentials as Credentials.anonymous().
            endpoint_override (str): the endpoint to connect to. Callers connecting to AWS do not typically need to set
                this; it is most useful when connecting to non-AWS, S3-compatible APIs.
            write_part_size (int): The part or chunk size when writing to S3. The default is 10 MiB. The minimum allowed
                part size is 5242880. Higher part size may increase throughput but also increase memory usage. Writing
                a single file to S3 can be done in a maximum of 10,000 parts, so the maximum size of a single file that
                can be written is about 98 GiB for the default part size.
            num_concurrent_write_parts (int): the maximum number of parts or chunks that can be uploaded concurrently
                when writing to S3 without blocking, defaults to 64. Setting a higher value may increase throughput, but
                may also increase memory usage.
            profile_name (str): the profile name used for configuring the default region, credentials, etc., when
                reading or writing to S3. If not provided, the AWS SDK picks the profile name from the 'aws.profile'
                system property, the "AWS_PROFILE" environment variable, or defaults to the string "default".
                Setting a profile name assumes that the credentials are provided via this profile; if that is not the
                case, you must explicitly set credentials using the access_key_id and secret_access_key.
            config_file_path (str): the path to the configuration file to use for configuring the default region,
                role_arn, output etc. when reading or writing to S3. If not provided, the AWS SDK picks the configuration
                file from the 'aws.configFile' system property, the "AWS_CONFIG_FILE" environment variable, or defaults
                to "{user.home}/.aws/config".
                Setting a configuration file path assumes that the credentials are provided via the config and
                credentials files; if that is not the case, you must explicitly set credentials using the access_key_id
                and secret_access_key.
                For reference on the configuration file format, check
                https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            credentials_file_path (str): the path to the credentials file to use for configuring the credentials,
                region, etc. when reading or writing to S3. If not provided, the AWS SDK picks the credentials file from
                the 'aws.credentialsFile' system property, the "AWS_CREDENTIALS_FILE" environment variable, or defaults
                to "{user.home}/.aws/credentials".
                Setting a credentials file path assumes that the credentials are provided via the config and
                credentials files; if that is not the case, you must explicitly set credentials using the access_key_id
                and secret_access_key.
                The main difference between config_file_path and credentials_file_path is around the conventions used
                in the files. For reference on the credentials file format, check
                https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html

        Raises:
            DHError: If unable to build the instructions object.
        """

        if not _JS3Instructions or not _JCredentials:
            raise DHError(message="S3Instructions requires the S3 specific deephaven extensions to be included in "
                                  "the package")

        try:
            builder = self.j_object_type.builder()

            if region_name is not None:
                builder.regionName(region_name)

            if max_concurrent_requests is not None:
                builder.maxConcurrentRequests(max_concurrent_requests)

            if read_ahead_count is not None:
                builder.readAheadCount(read_ahead_count)

            if fragment_size is not None:
                builder.fragmentSize(fragment_size)

            if connection_timeout is not None:
                builder.connectionTimeout(to_j_duration(connection_timeout))

            if read_timeout is not None:
                builder.readTimeout(to_j_duration(read_timeout))

            if ((access_key_id is not None and secret_access_key is None) or
                    (access_key_id is None and secret_access_key is not None)):
                raise DHError("Either both access_key_id and secret_access_key must be provided or neither")

            def throw_multiple_credentials_error(credentials1: str, credentials2: str):
                raise DHError(f"Only one set of credentials can be set, but found {credentials1} and {credentials2}")

            # Configure the credentials
            if access_key_id is not None:
                warn('access_key_id is deprecated, prefer setting credentials as '
                     'Credentials.basic(access_key_id, secret_access_key)', DeprecationWarning, stacklevel=2)
                # TODO(deephaven-core#6165): Delete deprecated parameters
                if anonymous_access:
                    throw_multiple_credentials_error("access_key_id", "anonymous_access")
                if credentials is not None:
                    throw_multiple_credentials_error("access_key_id", "credentials")
                builder.credentials(_JCredentials.basic(access_key_id, secret_access_key))
            elif anonymous_access:
                warn("anonymous_access is deprecated, prefer setting credentials as Credentials.anonymous()",
                     DeprecationWarning, stacklevel=2)
                # TODO(deephaven-core#6165): Delete deprecated parameters
                if credentials is not None:
                    throw_multiple_credentials_error("anonymous_access", "credentials")
                builder.credentials(_JCredentials.anonymous())
            elif credentials is not None:
                builder.credentials(credentials.j_object)

            if endpoint_override is not None:
                builder.endpointOverride(endpoint_override)

            if write_part_size is not None:
                builder.writePartSize(write_part_size)

            if num_concurrent_write_parts is not None:
                builder.numConcurrentWriteParts(num_concurrent_write_parts)

            if profile_name is not None:
                builder.profileName(profile_name)

            if config_file_path is not None:
                builder.configFilePath(config_file_path)

            if credentials_file_path is not None:
                builder.credentialsFilePath(credentials_file_path)

            self._j_object = builder.build()
        except Exception as e:
            raise DHError(e, "Failed to build S3 instructions") from e

    @property
    def j_object(self) -> jpy.JType:
        return self._j_object