DataChain

The DataChain class creates a data chain, which is a sequence of data manipulation steps such as reading data from storage, running AI or LLM models, or calling external service APIs to validate or enrich data. See DataChain for examples of how to create a chain.

C `module-attribute`

C = Column

ConnectionType `module-attribute`

# Union of the connection-like objects covered by this alias: a plain string,
# a SQLAlchemy URL / Connectable / Engine / Connection, a SQLAlchemy ORM
# Session, or a raw sqlite3 connection.
# NOTE(review): the string form is presumably a database URL, and the alias is
# presumably consumed by the database reader/writer helpers - confirm at the
# usage sites, which are not visible here.
ConnectionType = (
    str
    | sqlalchemy.engine.URL
    | sqlalchemy.engine.interfaces.Connectable
    | sqlalchemy.engine.Engine
    | sqlalchemy.engine.Connection
    | sqlalchemy.orm.Session
    | sqlite3.Connection
)

listings

listings(
    session: Session | None = None,
    in_memory: bool = False,
    column: str = "listing",
    **kwargs
) -> DataChain

Generate a chain with the list of cached listings. A listing is a special kind of dataset which holds the directory listing data of some underlying storage (e.g. an S3 bucket).

Example

import datachain as dc
dc.listings().show()

Source code in datachain/lib/dc/listings.py

def listings(
    session: Session | None = None,
    in_memory: bool = False,
    column: str = "listing",
    **kwargs,
) -> "DataChain":
    """Build a chain whose rows describe the cached storage listings.

    A listing is a special kind of dataset that holds the directory listing
    data of some underlying storage (e.g. an S3 bucket).

    Parameters:
        session: Session to use; it is resolved through `Session.get` together
            with `in_memory`.
        in_memory: Forwarded to both `Session.get` and `read_values`.
        column: Name of the column that holds the `ListingInfo` values.
        kwargs: Only the `catalog` key is read. When present and truthy it is
            used instead of the catalog of the resolved session.

    Example:
        ```py
        import datachain as dc
        dc.listings().show()
        ```
    """
    resolved_session = Session.get(session, in_memory=in_memory)

    # An explicitly supplied catalog wins; otherwise use the session's one.
    catalog = kwargs.get("catalog")
    if not catalog:
        catalog = resolved_session.catalog

    listing_values = {column: catalog.listings()}
    return read_values(
        session=resolved_session,
        in_memory=in_memory,
        output={column: ListingInfo},
        **listing_values,  # type: ignore[arg-type]
    )

read_csv

read_csv(
    path: (
        str
        | PathLike[str]
        | list[str]
        | list[PathLike[str]]
    ),
    delimiter: str | None = None,
    header: bool = True,
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows: int | None = None,
    session: Session | None = None,
    settings: dict | None = None,
    column_types: dict[str, str | DataType] | None = None,
    parse_options: (
        dict[str, str | bool | Callable] | None
    ) = None,
    **kwargs
) -> DataChain

Generate chain from csv files.

Parameters:

path (str | PathLike[str] | list[str] | list[PathLike[str]]) –

Storage URI with directory. URI must start with storage prefix such as s3://, gs://, az:// or "file:///".
delimiter (str | None, default: None ) –

Character for delimiting columns. Takes precedence if also specified in parse_options. Defaults to ",".
header (bool, default: True ) –

Whether the files include a header row.
output (OutputType, default: None ) –

Dictionary or feature class defining column names and their corresponding types. List of column names is also accepted, in which case types will be inferred.
column (str, default: '' ) –

Created column name.
model_name (str, default: '' ) –

Generated model name.
source (bool, default: True ) –

Whether to include info about the source file.
nrows (int | None, default: None ) –

Optional row limit.
session (Session | None, default: None ) –

Session to use for the chain.
settings (dict | None, default: None ) –

Settings to use for the chain.
column_types (dict[str, str | DataType] | None, default: None ) –

Dictionary of column names and their corresponding types. It is passed to CSV reader and for each column specified type auto inference is disabled.
parse_options (dict[str, str | bool | Callable] | None, default: None ) –

Tells the parser how to process lines. See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html

Example

Reading a csv file:

import datachain as dc
chain = dc.read_csv("s3://mybucket/file.csv")

Reading csv files from a directory as a combined dataset:

import datachain as dc
chain = dc.read_csv("s3://mybucket/dir")

Source code in datachain/lib/dc/csv.py

def read_csv(
    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
    delimiter: str | None = None,
    header: bool = True,
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows: int | None = None,
    session: Session | None = None,
    settings: dict | None = None,
    column_types: dict[str, "str | ArrowDataType"] | None = None,
    parse_options: dict[str, str | bool | Callable] | None = None,
    **kwargs,
) -> "DataChain":
    """Generate chain from csv files.

    Parameters:
        path: Storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///".
        delimiter: Character for delimiting columns. Takes precedence if also
            specified in `parse_options`. Defaults to ",".
        header: Whether the files include a header row.
        output: Dictionary or feature class defining column names and their
            corresponding types. List of column names is also accepted, in which
            case types will be inferred.
        column: Created column name.
        model_name: Generated model name.
        source: Whether to include info about the source file.
        nrows: Optional row limit.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        column_types: Dictionary of column names and their corresponding types.
            It is passed to CSV reader and for each column specified type auto
            inference is disabled. String values are resolved as pyarrow type
            aliases.
        parse_options: Tells the parser how to process lines. The dictionary
            passed in is copied and never modified.
            See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
        kwargs: Extra keyword arguments forwarded to `read_storage`.

    Raises:
        DatasetPrepareError: If `header` is False and `output` is missing or is
            of a type from which column names cannot be derived.

    Example:
        Reading a csv file:
        ```py
        import datachain as dc
        chain = dc.read_csv("s3://mybucket/file.csv")
        ```

        Reading csv files from a directory as a combined dataset:
        ```py
        import datachain as dc
        chain = dc.read_csv("s3://mybucket/dir")
        ```
    """
    from pandas._libs.parsers import STR_NA_VALUES
    from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
    from pyarrow.dataset import CsvFileFormat
    from pyarrow.lib import type_for_alias

    from .storage import read_storage

    # Work on a shallow copy so the caller's dictionary is never mutated when
    # the delimiter default/override is written below.
    parse_kwargs = dict(parse_options) if parse_options else {}
    parse_kwargs.setdefault("delimiter", ",")
    if delimiter:
        # An explicit `delimiter` argument overrides the one in parse_options.
        parse_kwargs["delimiter"] = delimiter

    if column_types:
        # String entries are pyarrow type aliases; resolve them to real types.
        column_types = {
            name: type_for_alias(typ) if isinstance(typ, str) else typ
            for name, typ in column_types.items()
        }
    else:
        column_types = {}

    chain = read_storage(path, session=session, settings=settings, **kwargs)

    # Without a header row the column names must come from `output`.
    column_names = None
    if not header:
        if not output:
            msg = "error parsing csv - provide output if no header"
            raise DatasetPrepareError(chain.name, msg)
        if isinstance(output, Sequence):
            column_names = output  # type: ignore[assignment]
        elif isinstance(output, dict):
            column_names = list(output.keys())
        elif (fr := ModelStore.to_pydantic(output)) is not None:
            column_names = list(fr.model_fields.keys())
        else:
            msg = f"error parsing csv - incompatible output type {type(output)}"
            raise DatasetPrepareError(chain.name, msg)

    arrow_parse_options = ParseOptions(**parse_kwargs)
    read_options = ReadOptions(column_names=column_names)
    convert_options = ConvertOptions(
        strings_can_be_null=True,
        null_values=STR_NA_VALUES,
        column_types=column_types,
    )
    csv_format = CsvFileFormat(
        parse_options=arrow_parse_options,
        read_options=read_options,
        convert_options=convert_options,
    )
    return chain.parse_tabular(
        output=output,
        column=column,
        model_name=model_name,
        source=source,
        nrows=nrows,
        format=csv_format,
        parse_options=arrow_parse_options,
    )

read_dataset

read_dataset(
    name: str,
    namespace: str | None = None,
    project: str | None = None,
    version: str | int | None = None,
    session: Session | None = None,
    settings: dict | None = None,
    delta: bool | None = False,
    delta_on: str | Sequence[str] | None = (
        "file.path",
        "file.etag",
        "file.version",
    ),
    delta_result_on: str | Sequence[str] | None = None,
    delta_compare: str | Sequence[str] | None = None,
    delta_retry: bool | str | None = None,
    delta_unsafe: bool = False,
    update: bool = False,
) -> DataChain

Get data from a saved Dataset. It returns the chain itself. If dataset or version is not found locally, it will try to pull it from Studio.

Parameters:

name (str) –

The dataset name, which can be a fully qualified name including the namespace and project. Alternatively, it can be a regular name, in which case the explicitly defined namespace and project will be used if they are set; otherwise, default values will be applied. The name can also include a version using the name@version format (e.g. "my_dataset@1.0.0"). If version is also provided explicitly, it takes priority.
namespace (str | None, default: None ) –

optional name of namespace in which dataset to read is created
project (str | None, default: None ) –

optional name of project in which dataset to read is created
version (str | int | None, default: None ) –

dataset version. Supports:

- Exact version strings: "1.2.3"
- Legacy integer versions: 1, 2, 3 (finds the latest version within that major version)
- Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
session (Session | None, default: None ) –

Session to use for the chain.
settings (dict | None, default: None ) –

Settings to use for the chain.
delta (bool | None, default: False ) –

If True, only process new or changed files instead of reprocessing everything. This saves time by skipping files that were already processed in previous versions. The optimization takes effect when a new version of the dataset is created. Default is False.
delta_on (str | Sequence[str] | None, default: ('file.path', 'file.etag', 'file.version') ) –

Field(s) that uniquely identify each record in the source data. Used to detect which records are new or changed. Default is ("file.path", "file.etag", "file.version").
delta_result_on (str | Sequence[str] | None, default: None ) –

Field(s) in the result dataset that match delta_on fields. Only needed if you rename the identifying fields during processing. Default is None.
delta_compare (str | Sequence[str] | None, default: None ) –

Field(s) used to detect if a record has changed. If not specified, all fields except delta_on fields are used. Default is None.
delta_retry (bool | str | None, default: None ) –

Controls retry behavior for failed records:

- String (field name): Reprocess records where this field is not empty (error mode)
- True: Reprocess records missing from the result dataset (missing mode)
- None: No retry processing (default)
update (bool, default: False ) –

If True always checks for newer versions available on Studio, even if some version of the dataset exists locally already. If False (default), it will only fetch the dataset from Studio if it is not found locally.
delta_unsafe (bool, default: False ) –

Allow restricted ops in delta: merge, union, subtract, diff, file_diff, agg, group_by, distinct. When multiple delta sources participate in one composed query, this must be enabled on every participating delta source.

Example

import datachain as dc
chain = dc.read_dataset("my_cats")

import datachain as dc
chain = dc.read_dataset("dev.animals.my_cats")

chain = dc.read_dataset("my_cats", version="1.0.0")

# Version can also be embedded in the name using the @ syntax
chain = dc.read_dataset("my_cats@1.0.0")

# Using version specifiers (PEP 440)
chain = dc.read_dataset("my_cats", version=">=1.0.0,<2.0.0")

# Legacy integer version support (finds latest in major version)
chain = dc.read_dataset("my_cats", version=1)  # Latest 1.x.x version

# Always check for newer versions matching a version specifier from Studio
chain = dc.read_dataset("my_cats", version=">=1.0.0", update=True)

session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
settings = {
    "cache": True,
    "parallel": 4,
    "workers": 4,
    "min_task_size": 1000,
    "prefetch": 10,
}
chain = dc.read_dataset(
    name="my_cats",
    version="1.0.0",
    session=session,
    settings=settings,
)

Source code in datachain/lib/dc/datasets.py

def read_dataset(
    name: str,
    namespace: str | None = None,
    project: str | None = None,
    version: str | int | None = None,
    session: Session | None = None,
    settings: dict | None = None,
    delta: bool | None = False,
    delta_on: str | Sequence[str] | None = (
        "file.path",
        "file.etag",
        "file.version",
    ),
    delta_result_on: str | Sequence[str] | None = None,
    delta_compare: str | Sequence[str] | None = None,
    delta_retry: bool | str | None = None,
    delta_unsafe: bool = False,
    update: bool = False,
) -> "DataChain":
    """Return a chain over a previously saved dataset.

    When the dataset or the requested version is not available locally, an
    attempt is made to pull it from Studio.

    Parameters:
        name: Dataset name. It may be fully qualified (including namespace and
            project) or a plain name; for a plain name the explicitly passed
            namespace and project are used when set, otherwise defaults apply.
            A version may be embedded with the ``name@version`` form
            (e.g. ``"my_dataset@1.0.0"``); an explicitly passed ``version``
            takes priority over the embedded one.
        namespace: Optional name of the namespace in which the dataset to read
            was created.
        project: Optional name of the project in which the dataset to read was
            created.
        version: Dataset version. Supports:

            - Exact version strings: "1.2.3"
            - Legacy integer versions: 1, 2, 3 (finds latest major version)
            - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2",
              "==1.2.*", etc.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        delta: If True, only new or changed files are processed instead of
            reprocessing everything, which saves time by skipping files already
            processed in previous versions. The optimization takes effect when
            a new version of the dataset is created. Default is False.
        delta_on: Field(s) that uniquely identify each record in the source
            data; used to detect which records are new or changed.
            Default is ("file.path", "file.etag", "file.version").
        delta_result_on: Field(s) in the result dataset that match the
            `delta_on` fields. Only needed if the identifying fields are
            renamed during processing. Default is None.
        delta_compare: Field(s) used to detect whether a record has changed.
            If not specified, all fields except the `delta_on` fields are used.
            Default is None.
        delta_retry: Controls retry behavior for failed records:

            - String (field name): Reprocess records where this field is not
              empty (error mode)
            - True: Reprocess records missing from the result dataset
              (missing mode)
            - None: No retry processing (default)
        delta_unsafe: Allow restricted ops in delta: merge, union, subtract,
            diff, file_diff, agg, group_by, distinct. When multiple delta
            sources participate in one composed query, this must be enabled on
            every participating delta source.
        update: If True, always check for newer versions available on Studio,
            even if some version of the dataset already exists locally. If
            False (default), the dataset is fetched from Studio only when it is
            not found locally.

    Raises:
        DatasetVersionNotFoundError: If `version` parses as a version specifier
            and no dataset version matches it.
        ValueError: If `delta` is truthy and `delta_on` is None.

    Example:
        ```py
        import datachain as dc
        chain = dc.read_dataset("my_cats")
        ```

        ```py
        import datachain as dc
        chain = dc.read_dataset("dev.animals.my_cats")
        ```

        ```py
        chain = dc.read_dataset("my_cats", version="1.0.0")
        ```

        ```py
        # Version can also be embedded in the name using the @ syntax
        chain = dc.read_dataset("my_cats@1.0.0")
        ```

        ```py
        # Using version specifiers (PEP 440)
        chain = dc.read_dataset("my_cats", version=">=1.0.0,<2.0.0")
        ```

        ```py
        # Legacy integer version support (finds latest in major version)
        chain = dc.read_dataset("my_cats", version=1)  # Latest 1.x.x version
        ```

        ```py
        # Always check for newer versions matching a version specifier from Studio
        chain = dc.read_dataset("my_cats", version=">=1.0.0", update=True)
        ```

        ```py
        session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
        settings = {
            "cache": True,
            "parallel": 4,
            "workers": 4,
            "min_task_size": 1000,
            "prefetch": 10,
        }
        chain = dc.read_dataset(
            name="my_cats",
            version="1.0.0",
            session=session,
            settings=settings,
        )
        ```
    """
    from datachain.telemetry import telemetry

    from .datachain import DataChain

    # A version embedded in the name is only a fallback for an explicit one.
    name, embedded_version = parse_dataset_with_version(name)
    if version is None:
        version = embedded_version

    telemetry.send_event_once("class", "datachain_init", name=name, version=version)

    session = Session.get(session)
    catalog = session.catalog

    namespace_name, project_name, name = catalog.get_full_dataset_name(
        name,
        project_name=project,
        namespace_name=namespace,
    )

    if version is not None:
        dataset_record = session.catalog.get_dataset_with_remote_fallback(
            name,
            namespace_name,
            project_name,
            update=update,
            include_incomplete=False,  # Never include incomplete datasets
        )

        # Backward compatibility: an integer version N means "the latest
        # N.x.y", so it is rewritten as the specifier ">=N.0.0,<N+1.0.0"
        # (e.g. version=2 resolves to something like 2.4.3 when that is the
        # highest 2.* version available). Anything else is used verbatim.
        version_spec = (
            f">={version}.0.0,<{version + 1}.0.0"
            if isinstance(version, int)
            else str(version)
        )

        from packaging.specifiers import InvalidSpecifier, SpecifierSet

        try:
            # Parsing succeeds only for real specifiers ...
            SpecifierSet(version_spec)
            # ... in which case the newest matching version is selected.
            matched_version = dataset_record.latest_compatible_version(version_spec)
            if not matched_version:
                raise DatasetVersionNotFoundError(
                    f"No dataset {name} version matching specifier {version_spec}"
                )
            version = matched_version
        except InvalidSpecifier:
            # Not a specifier (e.g. a plain "1.2.3"): keep it as an exact
            # version string.
            pass

    _settings = Settings(**settings) if settings else Settings()

    query = DatasetQuery(
        name=name,
        project_name=project_name,
        namespace_name=namespace_name,
        version=version,  #  type: ignore[arg-type]
        session=session,
        update=update,
    )

    # Start from the "sys" signal and add either the stored feature schema or,
    # when there is none, a schema derived from the raw column types.
    signals_schema = SignalSchema({"sys": Sys})
    signals_schema |= (
        SignalSchema.deserialize(query.feature_schema)
        if query.feature_schema
        else SignalSchema.from_column_types(query.column_types or {})
    )

    if delta:
        signals_schema = signals_schema.clone_without_sys_signals()

    chain = DataChain(query, _settings, signals_schema)

    if not delta:
        return chain

    if delta_on is None:
        raise ValueError("'delta on' fields must be defined")
    query.delta_spec = DeltaSpec(
        on=delta_on,
        right_on=delta_result_on,
        compare=delta_compare,
        delta_retry=delta_retry,
        delta_unsafe=delta_unsafe,
    )
    return chain

read_hf

read_hf(
    dataset: HFDatasetType,
    *args: Any,
    session: Session | None = None,
    settings: dict | None = None,
    column: str = "",
    model_name: str = "",
    limit: int = 0,
    **kwargs: Any
) -> DataChain

Generate chain from Hugging Face Hub dataset.

Parameters:

dataset (HFDatasetType) –

Path or name of the dataset to read from Hugging Face Hub, or an instance of datasets.Dataset-like object.
args (Any, default: () ) –

Additional positional arguments to pass to datasets.load_dataset.
session (Session | None, default: None ) –

Session to use for the chain.
settings (dict | None, default: None ) –

Settings to use for the chain.
column (str, default: '' ) –

Generated object column name.
model_name (str, default: '' ) –

Generated model name.
limit (int, default: 0 ) –

The maximum number of items to read from the HF dataset. Applies take(limit) to datasets.load_dataset. Defaults to 0 (no limit).
kwargs (Any, default: {} ) –

Parameters to pass to datasets.load_dataset.

Example

Load from Hugging Face Hub:

import datachain as dc
chain = dc.read_hf("beans", split="train")

Generate chain from loaded dataset:

from datasets import load_dataset
ds = load_dataset("beans", split="train")
import datachain as dc
chain = dc.read_hf(ds)

Streaming with limit, for large datasets:

import datachain as dc
ds = dc.read_hf("beans", split="train", streaming=True, limit=10)

or use HF split syntax (not supported if streaming is enabled):

import datachain as dc
ds = dc.read_hf("beans", split="train[:10]")

Source code in datachain/lib/dc/hf.py

def read_hf(
    dataset: "HFDatasetType",
    *args: Any,
    session: Session | None = None,
    settings: dict | None = None,
    column: str = "",
    model_name: str = "",
    limit: int = 0,
    **kwargs: Any,
) -> "DataChain":
    """Create a chain from a Hugging Face Hub dataset.

    Parameters:
        dataset: Path or name of the dataset to read from Hugging Face Hub,
            or an instance of a `datasets.Dataset`-like object.
        args: Additional positional arguments to pass to `datasets.load_dataset`.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        column: Generated object column name. When set, the generated model is
            nested under this single column.
        model_name: Generated model name. Falls back to `column` when empty.
        limit: The maximum number of items to read from the HF dataset.
            Applies `take(limit)` to `datasets.load_dataset`.
            Defaults to 0 (no limit).
        kwargs: Parameters to pass to `datasets.load_dataset`.

    Example:
        Load from Hugging Face Hub:
        ```py
        import datachain as dc
        chain = dc.read_hf("beans", split="train")
        ```

        Generate chain from loaded dataset:
        ```py
        from datasets import load_dataset
        ds = load_dataset("beans", split="train")
        import datachain as dc
        chain = dc.read_hf(ds)
        ```

        Streaming with limit, for large datasets:
        ```py
        import datachain as dc
        ds = dc.read_hf("beans", split="train", streaming=True, limit=10)
        ```

        or use HF split slicing syntax (not supported if streaming is enabled):
        ```py
        import datachain as dc
        ds = dc.read_hf("beans", split="train[:10]")
        ```
    """
    from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits

    from .values import read_values

    splits = stream_splits(dataset, *args, **kwargs)

    # A "split" column is only emitted when more than one split is present.
    output: dict[str, DataType] = {"split": str} if len(splits) > 1 else {}

    model_name = model_name or column or ""

    # The features of the first split are used to derive the output schema.
    first_split = next(iter(splits.values()))
    hf_output, normalized_names = get_output_schema(
        first_split.features, list(output)
    )
    output = output | hf_output
    model = dict_to_data_model(model_name, output, list(normalized_names.values()))
    if column:
        output = {column: model}

    chain = read_values(split=list(splits), session=session, settings=settings)
    generator = HFGenerator(dataset, model, limit, *args, **kwargs)
    return chain.gen(generator, output=output)

read_json

read_json(
    path: str | PathLike[str],
    type: FileType = "text",
    spec: DataType | None = None,
    schema_from: str | None = "auto",
    jmespath: str | None = None,
    column: str | None = "",
    model_name: str | None = None,
    format: str | None = "json",
    nrows: int | None = None,
    **kwargs
) -> DataChain

Get data from JSON. It returns the chain itself.

Parameters:

path (str | PathLike[str]) –

storage URI with directory. URI must start with storage prefix such as s3://, gs://, az:// or "file:///"
type (FileType, default: 'text' ) –

read file as "binary", "text", or "image" data. Default is "text".
spec (DataType | None, default: None ) –

optional Data Model
schema_from (str | None, default: 'auto' ) –

path to sample to infer spec (if schema not provided)
column (str | None, default: '' ) –

generated column name
model_name (str | None, default: None ) –

optional generated model name
format (str | None, default: 'json' ) –

"json", "jsonl"
jmespath (str | None, default: None ) –

optional JMESPATH expression to reduce JSON
nrows (int | None, default: None ) –

optional row limit for jsonl and JSON arrays

Example

infer JSON schema from data, reduce using JMESPATH

import datachain as dc
chain = dc.read_json("gs://json", jmespath="key1.key2")

infer JSON schema from a particular path

import datachain as dc
chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")

Source code in datachain/lib/dc/json.py

def read_json(
    path: str | os.PathLike[str],
    type: FileType = "text",
    spec: DataType | None = None,
    schema_from: str | None = "auto",
    jmespath: str | None = None,
    column: str | None = "",
    model_name: str | None = None,
    format: str | None = "json",
    nrows: int | None = None,
    **kwargs,
) -> "DataChain":
    """Read JSON data from storage and return the resulting chain.

    Parameters:
        path: storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///"
        type: read file as "binary", "text", or "image" data. Default is "text".
        spec: optional Data Model
        schema_from: path to sample to infer spec (if schema not provided).
            The default "auto" means `path` itself is used.
        jmespath: optional JMESPATH expression to reduce JSON
        column: generated column name. When empty, the leading run of word
            characters of `jmespath` is used, and failing that, `format`.
        model_name: optional generated model name
        format: "json", "jsonl"
        nrows: optional row limit for jsonl and JSON arrays
        kwargs: forwarded to `read_storage`; `session` is additionally passed
            on to the schema reader.

    Example:
        infer JSON schema from data, reduce using JMESPATH
        ```py
        import datachain as dc
        chain = dc.read_json("gs://json", jmespath="key1.key2")
        ```

        infer JSON schema from a particular path
        ```py
        import datachain as dc
        chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")
        ```
    """
    from .storage import read_storage

    if schema_from == "auto":
        schema_from = os.fspath(path)

    if jmespath and not column:
        # Derive the column name from the expression: everything before the
        # first non-word character (the whole expression if there is none).
        first_non_word = re.search(r"\W", jmespath)
        column = jmespath[: first_non_word.start()] if first_non_word else jmespath
    if not column:
        column = format

    chain = read_storage(uri=path, type=type, **kwargs)
    meta_reader = meta_formats.read_meta(
        schema_from=schema_from,
        format=format,
        spec=spec,
        model_name=model_name,
        jmespath=jmespath,
        nrows=nrows,
        session=kwargs.get("session"),
    )
    signal_dict = {
        column: meta_reader,
        "params": {"file": File},
    }

    # disable prefetch if nrows is set
    if nrows:
        settings = {"prefetch": 0}
    else:
        settings = {}

    cloudpickle.register_pickle_by_value(meta_formats)

    return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]

read_pandas

read_pandas(
    df: DataFrame,
    name: str = "",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    column: str = "",
) -> DataChain

Generate chain from pandas data-frame.

Example

import pandas as pd
import datachain as dc

df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
dc.read_pandas(df)

Source code in datachain/lib/dc/pandas.py

def read_pandas(  # type: ignore[override]
    df: "pd.DataFrame",
    name: str = "",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    column: str = "",
) -> "DataChain":
    """Create a chain from a pandas data-frame.

    Column labels are converted to strings and lower-cased; tuple labels
    (MultiIndex columns) have their parts joined with underscores first.

    Parameters:
        df: Data-frame whose columns become the chain's values.
        name: Passed to `read_values` as its first argument and used in the
            error raised for invalid column names.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: Forwarded to `read_values`.
        column: Forwarded to `read_values`.

    Raises:
        DatasetPrepareError: If a normalized column name is not a valid
            Python identifier.

    Example:
        ```py
        import pandas as pd
        import datachain as dc

        df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
        dc.read_pandas(df)
        ```
    """
    from .utils import DatasetPrepareError

    fr_map = {}
    for col in df.columns:
        # MultiIndex labels arrive as tuples: flatten them with underscores.
        label = "_".join(map(str, col)) if isinstance(col, tuple) else str(col)
        fr_map[label.lower()] = df[col].tolist()

    # Report the first normalized name that cannot serve as a column name.
    invalid = next((key for key in fr_map if not key.isidentifier()), None)
    if invalid is not None:
        raise DatasetPrepareError(
            name,
            f"import from pandas error - '{invalid}' cannot be a column name",
        )

    return read_values(
        name,
        session,
        settings=settings,
        column=column,
        in_memory=in_memory,
        **fr_map,
    )

read_parquet

read_parquet(
    path: (
        str
        | PathLike[str]
        | list[str]
        | list[PathLike[str]]
    ),
    partitioning: Any = "hive",
    output: dict[str, DataType] | None = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    session: Session | None = None,
    settings: dict | None = None,
    **kwargs
) -> DataChain

Generate chain from parquet files.

Parameters:

path (str | PathLike[str] | list[str] | list[PathLike[str]]) –

Storage path(s) or URI(s). Can be a local path or start with a storage prefix like s3://, gs://, az://, hf:// or "file:///". Supports glob patterns:

- * : wildcard
- ** : recursive wildcard
- ? : single character
- {a,b} : brace expansion list
- {1..9} : brace numeric or alphabetic range
partitioning (Any, default: 'hive' ) –

Any pyarrow partitioning schema.
output (dict[str, DataType] | None, default: None ) –

Dictionary defining column names and their corresponding types.
column (str, default: '' ) –

Created column name.
model_name (str, default: '' ) –

Generated model name.
source (bool, default: True ) –

Whether to include info about the source file.
session (Session | None, default: None ) –

Session to use for the chain.
settings (dict | None, default: None ) –

Settings to use for the chain.

Example

Reading a single file:

import datachain as dc
dc.read_parquet("s3://mybucket/file.parquet")

All files from a directory:

dc.read_parquet("s3://mybucket/dir/")

Only parquet files from a directory and all of its subdirectories:

dc.read_parquet("s3://mybucket/dir/**/*.parquet")

Using filename patterns - numeric, list, starting with zeros:

dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")

Source code in datachain/lib/dc/parquet.py

def read_parquet(
    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
    partitioning: Any = "hive",
    output: dict[str, DataType] | None = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    session: Session | None = None,
    settings: dict | None = None,
    **kwargs,
) -> "DataChain":
    """Build a chain from one or more parquet files.

    The files are first located with `read_storage()` and the resulting chain
    is then parsed as tabular data in the parquet format.

    Parameters:
        path: Storage path(s) or URI(s). Can be a local path or start with a
            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
            Supports glob patterns:
              - `*` : wildcard
              - `**` : recursive wildcard
              - `?` : single character
              - `{a,b}` : brace expansion list
              - `{1..9}` : brace numeric or alphabetic range
        partitioning: Any pyarrow partitioning schema.
        output: Dictionary defining column names and their corresponding types.
        column: Created column name.
        model_name: Generated model name.
        source: Whether to include info about the source file.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        **kwargs: Extra keyword arguments forwarded to `read_storage()`.

    Example:
        Reading a single file:
        ```py
        import datachain as dc
        dc.read_parquet("s3://mybucket/file.parquet")
        ```

        All files from a directory:
        ```py
        dc.read_parquet("s3://mybucket/dir/")
        ```

        Only parquet files from a directory, and all its subdirectories:
        ```py
        dc.read_parquet("s3://mybucket/dir/**/*.parquet")
        ```

        Using filename patterns - numeric, list, starting with zeros:
        ```py
        dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
        ```
    """
    from .storage import read_storage

    # Everything the tabular parser needs, gathered in one place; the format
    # is fixed because this reader only handles parquet.
    tabular_options = {
        "output": output,
        "column": column,
        "model_name": model_name,
        "source": source,
        "format": "parquet",
        "partitioning": partitioning,
    }

    storage_chain = read_storage(path, session=session, settings=settings, **kwargs)
    return storage_chain.parse_tabular(**tabular_options)

read_records

read_records(
    to_insert: dict | Iterable[dict] | None,
    schema: dict[str, DataType],
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
) -> DataChain

Create a DataChain from the provided records. This is a low-level function that directly inserts records into the database. Unlike convenience functions like read_values() or read_csv(), you have to provide the schema and records explicitly.

Compare it with read_values() which infers schema automatically and is using higher-level abstractions which makes it less efficient. E.g. read_values() cannot handle large datasets efficiently since it needs to load all data into memory.

Parameters:

to_insert (dict | Iterable[dict] | None) –

records to insert (empty list / None to create an empty chain). Can be a list, iterator, or generator. Iterators are processed lazily without loading all records into memory at once.

    Each record must be a dictionary with keys matching the schema.
    Dictionary values can be:
    - Primitive types (str, int, etc.)
    - DataModel objects (automatically flattened to match schema)
    - Raw flattened data (e.g., {"person__name": "Alice", ...})

schema (dict[str, DataType]) –

describes chain signals and their corresponding types.

Example

import datachain as dc
from datachain import DataModel

# Simple records with primitive types
records = [
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": 25}
]
chain = dc.read_records(records, schema={"name": str, "age": int})

# Complex records with DataModel objects (automatically flattened)
class Person(DataModel):
    name: str
    age: int
    city: str

people = [
    Person(name="Alice", age=30, city="NYC"),
    Person(name="Bob", age=25, city="LA"),
]
records = [{"person": p} for p in people]
chain = dc.read_records(records, schema={"person": Person})

# Raw pre-flattened data (also works)
records = [
    {"person__name": "Alice", "person__age": 30, "person__city": "NYC"},
    {"person__name": "Bob", "person__age": 25, "person__city": "LA"},
]
chain = dc.read_records(records, schema={"person": Person})

# Using an iterator/generator for memory efficiency
def generate_records():
    for i in range(1000000):
        yield {"id": i, "value": i * 2}

chain = dc.read_records(generate_records(), schema={"id": int, "value": int})

Notes

This call blocks until all records are inserted, but iterators are processed in batches to avoid loading all data into memory at once.

Source code in datachain/lib/dc/records.py

def read_records(
    to_insert: dict | Iterable[dict] | None,
    schema: dict[str, DataType],
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
) -> "DataChain":
    """Build a DataChain by inserting the given records straight into the database.

    This is the low-level entry point: both the records and their schema must
    be supplied explicitly. Convenience readers such as `read_values()` or
    `read_csv()` infer the schema for you, but they rely on higher-level
    abstractions and are less efficient; `read_values()` in particular has to
    hold all data in memory and therefore does not scale to large datasets.

    Parameters:
        to_insert: records to insert (empty list / None to create an empty chain). Can
                    be a list, iterator, or generator. Iterators are processed lazily
                    without loading all records into memory at once.

                    Each record must be a dictionary with keys matching the schema.
                    Dictionary values can be:
                    - Primitive types (str, int, etc.)
                    - DataModel objects (automatically flattened to match schema)
                    - Raw flattened data (e.g., {"person__name": "Alice", ...})
        schema: describes chain signals and their corresponding types.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: Forwarded to the dataset creation step together with
                    `session` and `settings`.

    Example:
        ```py
        import datachain as dc
        from datachain import DataModel

        # Simple records with primitive types
        records = [
            {"name": "Alice", "age": 30},
            {"name": "Bob", "age": 25}
        ]
        chain = dc.read_records(records, schema={"name": str, "age": int})

        # Complex records with DataModel objects (automatically flattened)
        class Person(DataModel):
            name: str
            age: int
            city: str

        people = [
            Person(name="Alice", age=30, city="NYC"),
            Person(name="Bob", age=25, city="LA"),
        ]
        records = [{"person": p} for p in people]
        chain = dc.read_records(records, schema={"person": Person})

        # Raw pre-flattened data (also works)
        records = [
            {"person__name": "Alice", "person__age": 30, "person__city": "NYC"},
            {"person__name": "Bob", "person__age": 25, "person__city": "LA"},
        ]
        chain = dc.read_records(records, schema={"person": Person})

        # Using an iterator/generator for memory efficiency
        def generate_records():
            for i in range(1000000):
                yield {"id": i, "value": i * 2}

        chain = dc.read_records(generate_records(), schema={"id": int, "value": int})
        ```

    Notes:
        This call blocks until all records are inserted, but iterators are processed
        in batches to avoid loading all data into memory at once.
    """
    # A lone dict is a single record; any other falsy input (None, empty
    # sequence) means "no records".
    if isinstance(to_insert, dict):
        records: Iterable[dict] = [to_insert]
    else:
        records = to_insert or []

    signal_schema = SignalSchema(schema)

    flat: Iterable[dict]
    content_hash: str | None
    if isinstance(records, (list, tuple)):
        # Concrete sequences are flattened eagerly so a content hash can be
        # computed over the flattened rows.
        flat = [_flatten_record(record, signal_schema) for record in records]
        content_hash = _content_hash(flat, signal_schema)
    else:
        # Iterators/generators stay lazy and therefore get no content hash.
        flat = (_flatten_record(record, signal_schema) for record in records)
        content_hash = None

    return create_records_dataset(
        flat,
        schema,
        content_hash,
        session=session,
        settings=settings,
        in_memory=in_memory,
    )

read_storage

read_storage(
    uri: (
        str
        | PathLike[str]
        | list[str]
        | list[PathLike[str]]
    ),
    *,
    type: FileType = "binary",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    recursive: bool | None = True,
    column: str = "file",
    update: bool = False,
    anon: bool | None = None,
    delta: bool | None = False,
    delta_on: str | Sequence[str] | None = (
        "file.path",
        "file.etag",
        "file.version",
    ),
    delta_result_on: str | Sequence[str] | None = None,
    delta_compare: str | Sequence[str] | None = None,
    delta_retry: bool | str | None = None,
    delta_unsafe: bool = False,
    client_config: dict | None = None
) -> DataChain

Get data from storage(s) as a list of files with all file attributes. It returns the chain itself as usual.

Parameters:

uri (str | PathLike[str] | list[str] | list[PathLike[str]]) –

Storage path(s) or URI(s). Can be a local path or start with a storage prefix like s3://, gs://, az://, hf:// or "file:///". Supports glob patterns: - * : wildcard - ** : recursive wildcard - ? : single character - {a,b} : brace expansion list - {1..9} : brace numeric or alphabetic range
type (FileType, default: 'binary' ) –

read file as "binary", "text", or "image" data. Default is "binary".
recursive (bool | None, default: True ) –

search recursively for the given path.
column (str, default: 'file' ) –

Column name that will contain File objects. Default is "file".
update (bool, default: False ) –

force storage reindexing. Default is False.
anon (bool | None, default: None ) –

If True, we will treat cloud bucket as public one.
client_config (dict | None, default: None ) –

Optional client configuration for the storage client.
delta (bool | None, default: False ) –

If True, only process new or changed files instead of reprocessing everything. This saves time by skipping files that were already processed in previous versions. The optimization is working when a new version of the dataset is created. Default is False.
delta_on (str | Sequence[str] | None, default: ('file.path', 'file.etag', 'file.version') ) –

Field(s) that uniquely identify each record in the source data. Used to detect which records are new or changed. Default is ("file.path", "file.etag", "file.version").
delta_result_on (str | Sequence[str] | None, default: None ) –

Field(s) in the result dataset that match delta_on fields. Only needed if you rename the identifying fields during processing. Default is None.
delta_compare (str | Sequence[str] | None, default: None ) –

Field(s) used to detect if a record has changed. If not specified, all fields except delta_on fields are used. Default is None.
delta_retry (bool | str | None, default: None ) –

Controls retry behavior for failed records: - String (field name): Reprocess records where this field is not empty (error mode) - True: Reprocess records missing from the result dataset (missing mode) - None: No retry processing (default)
delta_unsafe (bool, default: False ) –

Allow restricted ops in delta: merge, union, subtract, diff, file_diff, agg, group_by, distinct. When multiple delta sources participate in one composed query, this must be enabled on every participating delta source. Caller must ensure datasets are consistent and not partially updated.

Returns:

DataChain ( DataChain ) –

A DataChain object containing the file information.

Examples:

Simple call from s3:

import datachain as dc
dc.read_storage("s3://my-bucket/my-dir")

Match all .json files recursively using glob pattern

dc.read_storage("gs://bucket/meta/**/*.json")

Match image file extensions for directories with pattern

dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")

By ranges in filenames:

dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")

Multiple URIs:

dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])

With AWS S3-compatible storage:

dc.read_storage(
    "s3://my-bucket/my-dir",
    client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
)

Source code in datachain/lib/dc/storage.py

def read_storage(
    uri: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
    *,
    type: FileType = "binary",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    recursive: bool | None = True,
    column: str = "file",
    update: bool = False,
    anon: bool | None = None,
    delta: bool | None = False,
    delta_on: str | Sequence[str] | None = (
        "file.path",
        "file.etag",
        "file.version",
    ),
    delta_result_on: str | Sequence[str] | None = None,
    delta_compare: str | Sequence[str] | None = None,
    delta_retry: bool | str | None = None,
    delta_unsafe: bool = False,
    client_config: dict | None = None,
) -> "DataChain":
    """Get data from storage(s) as a list of files with all file attributes.
    It returns the chain itself as usual.

    Each URI is brace-expanded, split into a base URI and an optional glob
    pattern, and resolved through a listing dataset. URIs that resolve to a
    single object are read directly without a listing. All resulting chains
    are unioned into the returned chain.

    Parameters:
        uri: Storage path(s) or URI(s). Can be a local path or start with a
            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
            Supports glob patterns:
              - `*` : wildcard
              - `**` : recursive wildcard
              - `?` : single character
              - `{a,b}` : brace expansion list
              - `{1..9}` : brace numeric or alphabetic range
        type: read file as "binary", "text", or "image" data. Default is "binary".
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, use an in-memory database.
        recursive: search recursively for the given path.
        column: Column name that will contain File objects. Default is "file".
        update: force storage reindexing. Default is False.
        anon: If True, we will treat cloud bucket as public one. When left as
            None, anonymous access may be enabled automatically based on
            credential/anonymity probes of the given URIs.
        client_config: Optional client configuration for the storage client.
        delta: If True, only process new or changed files instead of reprocessing
            everything. This saves time by skipping files that were already processed in
            previous versions. The optimization is working when a new version of the
            dataset is created.
            Default is False.
        delta_on: Field(s) that uniquely identify each record in the source data.
            Used to detect which records are new or changed.
            Default is ("file.path", "file.etag", "file.version").
        delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
            Only needed if you rename the identifying fields during processing.
            Default is None.
        delta_compare: Field(s) used to detect if a record has changed.
            If not specified, all fields except `delta_on` fields are used.
            Default is None.
        delta_retry: Controls retry behavior for failed records:
            - String (field name): Reprocess records where this field is not empty
              (error mode)
            - True: Reprocess records missing from the result dataset (missing mode)
            - None: No retry processing (default)
        delta_unsafe: Allow restricted ops in delta: merge, union, subtract,
            diff, file_diff, agg, group_by, distinct. When multiple delta
            sources participate in one composed query, this must be enabled on
            every participating delta source. Caller must ensure datasets are
            consistent and not partially updated.

    Returns:
        DataChain: A DataChain object containing the file information.

    Raises:
        ValueError: If `uri` is an empty list/tuple ("No URIs provided").

    Examples:
        Simple call from s3:
        ```python
        import datachain as dc
        dc.read_storage("s3://my-bucket/my-dir")
        ```

        Match all .json files recursively using glob pattern
        ```py
        dc.read_storage("gs://bucket/meta/**/*.json")
        ```

        Match image file extensions for directories with pattern
        ```py
        dc.read_storage("s3://bucket/202?/**/*.{jpg,jpeg,png}")
        ```

        By ranges in filenames:
        ```py
        dc.read_storage("s3://bucket/202{1..4}/**/*.{jpg,jpeg,png}")
        ```

        Multiple URIs:
        ```python
        dc.read_storage(["s3://my-bkt/dir1", "s3://bucket2/dir2/dir3"])
        ```

        With AWS S3-compatible storage:
        ```python
        dc.read_storage(
            "s3://my-bucket/my-dir",
            client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
        )
        ```
    """
    from .datasets import read_dataset
    from .records import create_records_dataset, read_records

    file_type = get_file_type(type)

    # Normalize a single URI into a list so the rest of the function deals
    # with one shape only.
    uris = uri if isinstance(uri, (list, tuple)) else [uri]

    if not uris:
        raise ValueError("No URIs provided")

    for single_uri in uris:
        validate_cloud_bucket_name(str(single_uri))

    # Config used only for the probes below: an explicit client_config wins,
    # otherwise fall back to the config of the session that was passed in.
    probe_config = client_config or (
        session.catalog.client_config if session is not None else None
    )

    # Caller left `anon` undecided: switch to anonymous access when the
    # credential probe is negative and the anonymity probe is positive
    # (helper semantics inferred from their names -- confirm).
    if (
        anon is None
        and not _backends_have_credentials(uris, probe_config)
        and _all_buckets_anonymous(uris, probe_config)
    ):
        anon = True

    if anon is not None:
        client_config = (client_config or {}) | {"anon": anon}
    session = Session.get(session, client_config=client_config, in_memory=in_memory)
    catalog = session.catalog
    cache = catalog.cache
    # From here on client_config is the session catalog's config, not the
    # argument passed by the caller.
    client_config = session.catalog.client_config
    if anon is not None:
        # Session.get discards our client_config when an existing session is
        # passed. Re-apply anon locally for the listing path without mutating
        # the caller's session.
        client_config = client_config | {"anon": anon}
    listing_namespace_name = catalog.metastore.system_namespace_name
    listing_project_name = catalog.metastore.listing_project_name

    # Then expand all URIs that contain brace patterns
    expanded_uris = []
    for single_uri in uris:
        uri_str = str(single_uri)
        expanded_uris.extend(expand_brace_pattern(uri_str))

    # Now process each expanded URI
    chains = []
    # NOTE(review): listed_ds_name is populated below but never read in this
    # function.
    listed_ds_name = set()
    file_values = []

    updated_uris = set()

    for single_uri in expanded_uris:
        # Check if URI contains glob patterns and split them
        base_uri, glob_pattern = split_uri_pattern(single_uri)

        # If a pattern is found, use the base_uri for listing
        # The pattern will be used for filtering later
        list_uri_to_use = base_uri if glob_pattern else single_uri

        # Avoid double updates for the same URI
        update_single_uri = False
        if update and (list_uri_to_use not in updated_uris):
            updated_uris.add(list_uri_to_use)
            update_single_uri = True

        list_ds_name, list_uri, list_path, _ = get_listing(
            list_uri_to_use, session, update=update_single_uri
        )

        # list_ds_name is None if object is a file, we don't want to use cache
        # or do listing in that case - just read that single object
        if not list_ds_name:
            file_values.append(
                get_file_info(list_uri, cache, client_config=client_config)
            )
            continue

        dc = read_dataset(
            list_ds_name,
            namespace=listing_namespace_name,
            project=listing_project_name,
            session=session,
            settings=settings,
            delta=delta,
            delta_on=delta_on,
            delta_result_on=delta_result_on,
            delta_compare=delta_compare,
            delta_retry=delta_retry,
            delta_unsafe=delta_unsafe,
        )
        # NOTE(review): this uses the caller's `update` flag, not the
        # de-duplicated `update_single_uri` computed above.
        dc._query.update = update
        dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

        def lst_fn(ds_name, lst_uri):
            # Seed for .gen() iteration. content_hash=None because hash_callable
            # doesn't capture list_func's closure (which holds `lst_uri`, `cache`,
            # `client_config`) -- auto-hashing the seed would let the UDF
            # checkpoint cache return a stale listing across URIs/runs.
            (
                create_records_dataset(
                    [{"seed": 0}],
                    schema={"seed": int},
                    content_hash=None,
                    session=session,
                    settings=settings,
                    in_memory=in_memory,
                )
                .settings(
                    prefetch=0,
                    namespace=listing_namespace_name,
                    project=listing_project_name,
                )
                .gen(
                    list_bucket(lst_uri, cache, client_config=client_config),
                    output={f"{column}": file_type},
                )
                # for internal listing datasets, we always bump major version
                .save(ds_name, listing=True, update_version="major")
            )

        # Always attach listing_fn so resolve_listing can refresh stale listings.
        # The loop variables are bound as lambda defaults so each callback keeps
        # the dataset name and URI of its own iteration.
        dc._query.set_listing_fn(
            lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
        )

        # If a glob pattern was detected, use it for filtering
        # Otherwise, use the original list_path from get_listing
        if glob_pattern:
            # Determine if we should use recursive listing based on the pattern
            use_recursive = should_use_recursion(glob_pattern, recursive or False)

            # Apply glob filter - no need for brace expansion here as it's done above
            chain = apply_glob_filter(
                dc, glob_pattern, list_path, use_recursive, column
            )
            chains.append(chain)
        else:
            # No glob pattern detected, use normal ls behavior
            chains.append(ls(dc, list_path, recursive=recursive, column=column))

        listed_ds_name.add(list_ds_name)

    # Union all per-URI listing chains into one; None when every URI resolved
    # to a single file.
    storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)

    if file_values:
        # Use read_records directly (not read_values) so the chain hash is
        # derived from the flattened File records (deterministic across runs) —
        # needed for checkpoint reuse on single-file read_storage /
        # read_csv / read_parquet.
        # NOTE(review): records are keyed "file" even when `column` differs;
        # the schema is then mutated with `column` -- confirm this is intended.
        file_chain = read_records(
            [{"file": f} for f in file_values],
            schema={"file": file_type},
            session=session,
            settings=settings,
            in_memory=in_memory,
        )
        file_chain.signals_schema = file_chain.signals_schema.mutate(
            {f"{column}": file_type}
        )
        storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain

    # `uris` is non-empty, so at least one of chains/file_values should have
    # been populated -- assumes expand_brace_pattern never returns an empty
    # list (TODO confirm).
    assert storage_chain is not None

    return storage_chain

read_zarr

read_zarr(
    path: (
        str
        | PathLike[str]
        | list[str]
        | list[PathLike[str]]
    ),
    column: str = "zarr",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    **kwargs
) -> DataChain

Generate a chain with one row per Zarr store.

Unlike :func:read_storage, which emits one row per physical object, this reader collapses every object under a store root into a single :class:~datachain.lib.zarr.ZarrStore row. Stores are discovered using the standard *.zarr naming convention.

Parameters:

path (str | PathLike[str] | list[str] | list[PathLike[str]]) –

Storage path(s) or URI(s). Can be a local path or start with a storage prefix like s3://, gs://, az:// or file://. Supports glob patterns.
column (str, default: 'zarr' ) –

Created column name. Defaults to "zarr".
session (Session | None, default: None ) –

Session to use for the chain.
settings (dict | None, default: None ) –

Settings to use for the chain.
in_memory (bool, default: False ) –

If True, use an in-memory database.

Example

import datachain as dc
chain = dc.read_zarr("s3://mybucket/data/")
for (store,) in chain.limit(1).to_iter("zarr"):
    print(store.get_info())

Source code in datachain/lib/dc/zarr.py

def read_zarr(
    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
    column: str = "zarr",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    **kwargs,
) -> "DataChain":
    """Build a chain in which every row represents one Zarr store.

    Where :func:`read_storage` yields a row for each physical object, this
    reader folds all objects below a store root into one
    :class:`~datachain.lib.zarr.ZarrStore` row.  Store roots are found via the
    standard ``*.zarr`` naming convention.

    Parameters:
        path: Storage path(s) or URI(s). Can be a local path or start with a
            storage prefix like `s3://`, `gs://`, `az://` or `file://`.
            Supports glob patterns.
        column: Created column name. Defaults to ``"zarr"``.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, use an in-memory database.
        **kwargs: Extra keyword arguments forwarded to :func:`read_storage`.

    Example:
        ```py
        import datachain as dc
        chain = dc.read_zarr("s3://mybucket/data/")
        for (store,) in chain.limit(1).to_iter("zarr"):
            print(store.get_info())
        ```
    """
    from datachain.lib.zarr import (
        ZARR_ROOT_MARKERS,
        ZARR_SUFFIX,
        ZarrStore,
        file_to_store,
    )

    from .datachain import C
    from .storage import read_storage

    # The storage listing covers the whole prefix recursively, i.e. both the
    # store markers and the chunk objects; the filter below narrows it down.
    listing = read_storage(
        path, session=session, settings=settings, in_memory=in_memory, **kwargs
    )

    # Two predicates per marker, in sorted marker order: one for a marker
    # sitting directly inside a ``*.zarr`` directory (discovery), one for a
    # marker at the listing root (the path itself is a concrete store).
    # Chunks and nested array/group metadata match neither and are dropped.
    root_marker_filters = [
        predicate
        for marker in sorted(ZARR_ROOT_MARKERS)
        for predicate in (
            C("file.path").glob(f"*{ZARR_SUFFIX}/{marker}"),
            C("file.path") == marker,
        )
    ]
    any_root_marker = reduce(lambda left, right: left | right, root_marker_filters)

    return listing.filter(any_root_marker).map(
        file_to_store, output={column: ZarrStore}
    )

read_values

read_values(
    ds_name: str = "",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    output: OutputType = None,
    column: str = "",
    **fr_map
) -> DataChain

Generate chain from list of values.

Example

import datachain as dc
dc.read_values(fib=[1, 2, 3, 5, 8])

Source code in datachain/lib/dc/values.py

def read_values(
    ds_name: str = "",
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    output: OutputType = None,
    column: str = "",
    **fr_map,
) -> "DataChain":
    """Generate chain from list of values.

    The keyword arguments are converted into row tuples, and a generator over
    those tuples is attached to a one-row seed dataset via `.gen()`.

    Parameters:
        ds_name: Passed to the values-to-tuples conversion step together with
            `output` and the keyword values.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: Forwarded to the seed dataset creation together with
            `session` and `settings`.
        output: Optional output type definition for the generated signals.
        column: If set, the generated signals are wrapped into a single
            data-model column with this name.
        **fr_map: Signal name to values mapping, e.g. `fib=[1, 2, 3, 5, 8]`.

    Example:
        ```py
        import datachain as dc
        dc.read_values(fib=[1, 2, 3, 5, 8])
        ```
    """
    tuple_type, output, tuples = values_to_tuples(ds_name, output, **fr_map)

    def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
        yield from tuples

    _func_fr.__name__ = "read_values"

    # Seed for .gen() iteration. content_hash=None because hash_callable
    # doesn't capture _func_fr's closure (which holds `tuples`) — auto-hashing
    # the seed would collide every read_values() call saved under the same
    # name. Drop this once hash_callable becomes closure-aware.
    chain = create_records_dataset(
        [{"seed": 0}],
        schema={"seed": int},
        content_hash=None,
        session=session,
        settings=settings,
        in_memory=in_memory,
    )
    # With an explicit column name, nest all generated signals under that
    # single column instead of exposing them at the top level.
    if column:
        output = {column: dict_to_data_model(column, output)}  # type: ignore[arg-type]
    return chain.gen(_func_fr, output=output)

read_database

read_database(
    query: str | Executable,
    connection: ConnectionType,
    params: (
        Sequence[Mapping[str, Any]]
        | Mapping[str, Any]
        | None
    ) = None,
    *,
    output: dict[str, DataType] | None = None,
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    infer_schema_length: int | None = 100
) -> DataChain

Read the results of a SQL query into a DataChain.

Parameters:

query (str | Executable) –

The SQL query to execute. Can be a raw SQL string or a SQLAlchemy Executable object.
connection (ConnectionType) –

SQLAlchemy connectable, str, or a sqlite3 connection. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible for engine disposal and connection closure for the SQLAlchemy connectable; str connections are closed automatically.
params (Sequence[Mapping[str, Any]] | Mapping[str, Any] | None, default: None ) –

Parameters to pass to execute method.
output (dict[str, DataType] | None, default: None ) –

A dictionary mapping column names to types, used to override the schema inferred from the query results.
session (Session | None, default: None ) –

Session to use for the chain.
settings (dict | None, default: None ) –

Settings to use for the chain.
in_memory (bool, default: False ) –

If True, creates an in-memory session. Defaults to False.
infer_schema_length (int | None, default: 100 ) –

The maximum number of rows to scan for inferring schema. If set to None, the full data may be scanned. The rows used for schema inference are stored in memory, so large values can lead to high memory usage. Only applies if the output parameter is not set for the given column.

Examples:

Reading from a SQL query against a user-supplied connection:

query = "SELECT key, value FROM tbl"
chain = dc.read_database(query, connection, output={"value": float})

Load data from a SQLAlchemy driver/engine:

from sqlalchemy import create_engine
engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
chain = dc.read_database("select * from tbl", engine)

Load data from a parameterized SQLAlchemy query:

query = "SELECT key, value FROM tbl WHERE value > :value"
dc.read_database(query, engine, params={"value": 50})

Notes

This function works with a variety of databases — including, but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is installed.
This call is blocking, and will execute the query and return once the results are saved.

Source code in datachain/lib/dc/database.py

def read_database(
    query: "str | sqlalchemy.sql.expression.Executable",
    connection: "ConnectionType",
    params: Sequence[Mapping[str, Any]] | Mapping[str, Any] | None = None,
    *,
    output: dict[str, "DataType"] | None = None,
    session: "Session | None" = None,
    settings: dict | None = None,
    in_memory: bool = False,
    infer_schema_length: int | None = 100,
) -> "DataChain":
    """Execute a SQL query and load its result set into a DataChain.

    Args:
        query:
            The SQL query to execute. Either a raw SQL string (wrapped with
            `sqlalchemy.text`) or a SQLAlchemy `Executable` object.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection.
            Using SQLAlchemy makes it possible to use any DB supported by that
            library. If a DBAPI2 object, only sqlite3 is supported. The user is
            responsible for engine disposal and connection closure for the
            SQLAlchemy connectable; str connections are closed automatically.
        params: Parameters to pass to the execute method.
        output: A dictionary mapping column names to types, used to override the
            schema inferred from the query results.
        session: Session to use for the chain.
        settings: Settings to use for the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        infer_schema_length:
            The maximum number of rows to scan for inferring schema.
            If set to `None`, the full data may be scanned.
            The rows used for schema inference are stored in memory,
            so large values can lead to high memory usage.
            Only applies if the `output` parameter is not set for the given column.

    Examples:
        Reading from a SQL query against a user-supplied connection:
        ```python
        query = "SELECT key, value FROM tbl"
        chain = dc.read_database(query, connection, output={"value": float})
        ```

        Load data from a SQLAlchemy driver/engine:
        ```python
        from sqlalchemy import create_engine
        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
        chain = dc.read_database("select * from tbl", engine)
        ```

        Load data from a parameterized SQLAlchemy query:
        ```python
        query = "SELECT key, value FROM tbl WHERE value > :value"
        dc.read_database(query, engine, params={"value": 50})
        ```

    Notes:
        - This function works with a variety of databases — including,
          but not limited to, SQLite, DuckDB, PostgreSQL, and Snowflake,
          provided the appropriate driver is installed.
        - This call is blocking, and will execute the query and return once the
          results are saved.
    """
    from datachain.lib.dc.records import read_records

    explicit_schema = output or {}
    statement = sqlalchemy.text(query) if isinstance(query, str) else query
    # Request server-side cursors via the `stream_results` execution option.
    execute_kwargs = {"execution_options": {"stream_results": True}}

    with _connect(connection) as conn:
        with conn.execute(statement, params, **execute_kwargs) as result:
            # Only columns without an explicit type are inferred; the
            # comprehension keeps the result-set column order.
            columns_to_infer = [
                name for name in result.keys() if name not in explicit_schema
            ]
            sampled_rows, inferred_schema = _infer_schema(
                result, columns_to_infer, infer_schema_length
            )
            # Rows handed back by `_infer_schema` are chained ahead of whatever
            # is still left in the cursor.
            all_rows = itertools.chain(sampled_rows, result)
            records = (row._asdict() for row in all_rows)
            return read_records(
                records,
                session=session,
                settings=settings,
                in_memory=in_memory,
                # Explicit types take precedence over inferred ones.
                schema=inferred_schema | explicit_schema,
            )

datasets

datasets(
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    column: str | None = None,
    include_listing: bool = False,
    studio: bool = False,
    attrs: list[str] | None = None,
) -> DataChain

Generate chain with list of registered datasets.

Parameters:

session (Session | None, default: None ) –

Optional session instance. If not provided, uses default session.
settings (dict | None, default: None ) –

Optional dictionary of settings to configure the chain.
in_memory (bool, default: False ) –

If True, creates an in-memory session. Defaults to False.
column (str | None, default: None ) –

Name of the output column in the chain. Defaults to None which means no top level column will be created.
include_listing (bool, default: False ) –

If True, includes listing datasets. Defaults to False.
studio (bool, default: False ) –

If True, returns datasets from Studio only, otherwise returns all local datasets. Defaults to False.
attrs (list[str] | None, default: None ) –

Optional list of attributes to filter datasets on. It can be just an attribute without a value, e.g. "NLP", or an attribute with a value, e.g. "location=US". An attribute with a value can also accept "*" to target all datasets that have that attribute name, e.g. "location=*".

Returns:

DataChain ( DataChain ) –

A new DataChain instance containing dataset information.

Example

import datachain as dc

chain = dc.datasets(column="dataset")
for ds in chain.to_iter("dataset"):
    print(f"{ds.name}@v{ds.version}")

Source code in datachain/lib/dc/datasets.py

def datasets(
    session: Session | None = None,
    settings: dict | None = None,
    in_memory: bool = False,
    column: str | None = None,
    include_listing: bool = False,
    studio: bool = False,
    attrs: list[str] | None = None,
) -> "DataChain":
    """Build a chain that lists the registered datasets.

    Temporary datasets are always left out of the result.

    Args:
        session: Optional session instance. If not provided, uses default session.
        settings: Optional dictionary of settings to configure the chain.
        in_memory: If True, creates an in-memory session. Defaults to False.
        column: Name of the output column in the chain. Defaults to None which
            means no top level column will be created and the `DatasetInfo`
            fields become the chain columns.
        include_listing: If True, includes listing datasets. Defaults to False.
        studio: If True, returns datasets from Studio only,
            otherwise returns all local datasets. Defaults to False.
        attrs: Optional list of attributes to filter datasets on. It can be just
            an attribute without a value, e.g. `"NLP"`, or an attribute with a
            value, e.g. `"location=US"`. An attribute with a value can also
            accept `"*"` to target all datasets that have that attribute name,
            e.g. `"location=*"`. A dataset must match every listed attribute.

    Returns:
        DataChain: A new DataChain instance containing dataset information.

    Example:
        ```py
        import datachain as dc

        chain = dc.datasets(column="dataset")
        for ds in chain.to_iter("dataset"):
            print(f"{ds.name}@v{ds.version}")
        ```
    """
    session = Session.get(session, in_memory=in_memory)
    catalog = session.catalog

    # Build the info records, skipping temporary datasets as we go.
    infos = []
    for dataset, version, job in catalog.list_datasets_versions(
        include_listing=include_listing, studio=studio
    ):
        info = DatasetInfo.from_models(dataset, version, job)
        if not info.is_temp:
            infos.append(info)

    if attrs:
        # Keep only datasets that carry all of the requested attributes.
        infos = [
            info for info in infos if all(info.has_attr(attr) for attr in attrs)
        ]

    if column:
        return read_values(
            session=session,
            settings=settings,
            in_memory=in_memory,
            output={column: DatasetInfo},
            **{column: infos},  # type: ignore[arg-type]
        )

    # No top-level column requested: flatten the dataset fields into columns.
    schema = {}
    for field_name, hint in get_type_hints(DatasetInfo).items():
        if field_name not in DatasetInfo.model_fields:
            continue
        # Parameterized dict hints are reduced to the plain `dict` type.
        schema[field_name] = dict if get_origin(hint) is dict else hint

    columns = {name: [] for name in DatasetInfo.model_fields}  # type: ignore[var-annotated]
    for info in infos:
        for field_name, value in info.model_dump().items():
            columns[field_name].append(value)

    return read_values(
        session=session,
        settings=settings,
        in_memory=in_memory,
        output=schema,
        **columns,  # type: ignore[arg-type]
    )

delete_dataset

delete_dataset(
    name: str,
    namespace: str | None = None,
    project: str | None = None,
    version: str | None = None,
    force: bool | None = False,
    studio: bool | None = False,
    session: Session | None = None,
    in_memory: bool = False,
) -> None

Removes specific dataset version or all dataset versions, depending on a force flag.

The rows table is dropped but the version metadata is kept so the semver stays reserved and dependents can still resolve lineage.

Parameters:

name (str) –

The dataset name, which can be a fully qualified name including the namespace and project. Alternatively, it can be a regular name, in which case the explicitly defined namespace and project will be used if they are set; otherwise, default values will be applied.
namespace (str | None, default: None ) –

optional name of namespace in which dataset to delete is created
project (str | None, default: None ) –

optional name of project in which dataset to delete is created
version (str | None, default: None ) –

Optional dataset version
force (bool | None, default: False ) –

If True, all dataset versions will be removed. Defaults to False.
studio (bool | None, default: False ) –

If True, removes dataset from Studio only, otherwise removes local dataset. Defaults to False.
session (Session | None, default: None ) –

Optional session instance. If not provided, uses default session.
in_memory (bool, default: False ) –

If True, creates an in-memory session. Defaults to False.

Returns: None

Example

import datachain as dc
dc.delete_dataset("cats")

import datachain as dc
dc.delete_dataset("cats", version="1.0.0")

Source code in datachain/lib/dc/datasets.py

def delete_dataset(
    name: str,
    namespace: str | None = None,
    project: str | None = None,
    version: str | None = None,
    force: bool | None = False,
    studio: bool | None = False,
    session: Session | None = None,
    in_memory: bool = False,
) -> None:
    """Removes specific dataset version or all dataset versions, depending on
    a force flag.

    The rows table is dropped but the version metadata is kept so the semver
    stays reserved and dependents can still resolve lineage.

    Args:
        name: The dataset name, which can be a fully qualified name including the
            namespace and project. Alternatively, it can be a regular name, in which
            case the explicitly defined namespace and project will be used if they are
            set; otherwise, default values will be applied. A version parsed out
            of the name is used when `version` is not given.
        namespace: Optional name of the namespace in which the dataset to delete
            was created.
        project: Optional name of the project in which the dataset to delete
            was created.
        version: Optional dataset version. When omitted (and `force` is not set),
            the latest version of the dataset is removed.
        force: If True, all dataset versions will be removed. Defaults to False.
        studio: If True, removes dataset from Studio only, otherwise removes local
            dataset. Defaults to False.
        session: Optional session instance. If not provided, uses default session.
        in_memory: If True, creates an in-memory session. Defaults to False.

    Returns: None

    Raises:
        DatasetNotFoundError: If the project the dataset should live in does
            not exist.

    Example:
        ```py
        import datachain as dc
        dc.delete_dataset("cats")
        ```

        ```py
        import datachain as dc
        dc.delete_dataset("cats", version="1.0.0")
        ```
    """
    from datachain.studio import remove_studio_dataset

    # An explicit `version` argument wins over one parsed from the name.
    name, name_version = parse_dataset_with_version(name)
    if version is None:
        version = name_version

    session = Session.get(session, in_memory=in_memory)
    catalog = session.catalog

    namespace_name, project_name, name = catalog.get_full_dataset_name(
        name,
        project_name=project,
        namespace_name=namespace,
    )

    # Outside Studio, a Studio-only removal is delegated to the Studio client.
    if not is_studio() and studio:
        return remove_studio_dataset(
            None, name, namespace_name, project_name, version=version, force=force
        )

    try:
        ds_project = get_project(project_name, namespace_name, session=session)
    except ProjectNotFoundError:
        # The two literals are concatenated into ONE message; a comma between
        # them would pass two positional arguments to the exception instead.
        raise DatasetNotFoundError(
            f"Dataset {name} not found in namespace {namespace_name} and project"
            f" {project_name}"
        ) from None

    if not force:
        # Without an explicit version, fall back to the latest one.
        version = (
            version
            or catalog.get_dataset(
                name,
                namespace_name=ds_project.namespace.name,
                project_name=ds_project.name,
                versions=None,
                include_incomplete=False,
            ).latest_version
        )
    else:
        # Forced removal targets every version, so no single version is passed.
        version = None
    catalog.remove_dataset(name, ds_project, version=version, force=force)

move_dataset

move_dataset(
    src: str,
    dest: str,
    session: Session | None = None,
    in_memory: bool = False,
) -> None

Moves an entire dataset between namespaces and projects.

Parameters:

src (str) –

The source dataset name. This can be a fully qualified name that includes the namespace and project, or a regular name. If a regular name is used, default values will be applied. The source dataset will no longer exist after the move.
dest (str) –

The destination dataset name. This can also be a fully qualified name with a namespace and project, or just a regular name (default values will be used in that case). The original dataset will be moved here.
session (Session | None, default: None ) –

An optional session instance. If not provided, the default session will be used.
in_memory (bool, default: False ) –

If True, creates an in-memory session. Defaults to False.

Returns:

None –

None

Examples:

import datachain as dc
dc.move_dataset("cats", "new_cats")

import datachain as dc
dc.move_dataset("dev.animals.cats", "prod.animals.cats")

Source code in datachain/lib/dc/datasets.py

def move_dataset(
    src: str,
    dest: str,
    session: Session | None = None,
    in_memory: bool = False,
) -> None:
    """Move a whole dataset to another name, project and/or namespace.

    Args:
        src: The source dataset name. This can be a fully qualified name that includes
            the namespace and project, or a regular name. If a regular name is used,
            default values will be applied. The source dataset will no longer exist
            after the move.
        dest: The destination dataset name. This can also be a fully qualified
            name with a namespace and project, or just a regular name (default values
            will be used in that case). The original dataset will be moved here.
        session: An optional session instance. If not provided, the default session
            will be used.
        in_memory: If True, creates an in-memory session. Defaults to False.

    Returns:
        None

    Examples:
        ```python
        import datachain as dc
        dc.move_dataset("cats", "new_cats")
        ```

        ```python
        import datachain as dc
        dc.move_dataset("dev.animals.cats", "prod.animals.cats")
        ```
    """
    session = Session.get(session, in_memory=in_memory)
    catalog = session.catalog

    # Expand both names into (namespace, project, dataset name) triples.
    src_namespace, src_project, src_name = catalog.get_full_dataset_name(src)
    dest_namespace, dest_project, dest_name = catalog.get_full_dataset_name(dest)

    dataset = catalog.get_dataset(
        src_name,
        namespace_name=src_namespace,
        project_name=src_project,
        versions=None,
        include_incomplete=False,
    )

    # The destination project lookup is asked to create the project only
    # when `is_studio()` is true.
    target_project = catalog.metastore.get_project(
        dest_project,
        dest_namespace,
        create=is_studio(),
    )

    catalog.update_dataset(
        dataset,
        name=dest_name,
        project_id=target_project.id,
    )

delete_namespace

delete_namespace(
    name: str, session: Session | None = None
) -> None

Removes a namespace by name.

Raises:

NamespaceNotFoundError –

If the namespace does not exist.
NamespaceDeleteNotAllowedError –

If the namespace is non-empty, is the default namespace, or is a system namespace, as these cannot be removed.

Parameters:

name (str) –

The name of the namespace.
session (Session | None, default: None ) –

Session to use for looking up and removing the namespace.

Example

import datachain as dc
dc.delete_namespace("dev")

Source code in datachain/lib/namespaces.py

def delete_namespace(name: str, session: Session | None = None) -> None:
    """
    Delete the namespace with the given name.

    If `parse_name` extracts a project part from `name`, the call is
    forwarded to `delete_project` instead.

    Raises:
        NamespaceNotFoundError: If the namespace does not exist.
        NamespaceDeleteNotAllowedError: If the namespace is non-empty,
            is the default namespace, or is a system namespace,
            as these cannot be removed.

    Parameters:
        name: The name of the namespace.
        session: Session to use for looking up and removing the namespace.

    Example:
        ```py
        import datachain as dc
        dc.delete_namespace("dev")
        ```
    """
    session = Session.get(session)
    metastore = session.catalog.metastore

    namespace_name, project_name = parse_name(name)
    if project_name:
        # The name carried a project part, so remove that project instead.
        return delete_project(project_name, namespace_name, session)

    # Resolve the namespace first, before the protection checks below.
    namespace = metastore.get_namespace(name)

    if name == metastore.system_namespace_name:
        raise NamespaceDeleteNotAllowedError(
            f"Namespace {metastore.system_namespace_name} cannot be removed"
        )

    if name == metastore.default_namespace_name:
        raise NamespaceDeleteNotAllowedError(
            f"Namespace {metastore.default_namespace_name} cannot be removed"
        )

    # Refuse to drop a namespace that still owns projects.
    project_count = metastore.count_projects(namespace.id)
    if project_count > 0:
        raise NamespaceDeleteNotAllowedError(
            f"Namespace cannot be removed. It contains {project_count} project(s). "
            "Please remove the project(s) first."
        )

    metastore.remove_namespace(namespace.id)

is_studio

is_studio() -> bool

Check if the runtime environment is Studio (not local).

Source code in datachain/lib/dc/utils.py

def is_studio() -> bool:
    """Tell whether the code runs in Studio rather than locally.

    The answer comes from `getenv_bool("DATACHAIN_IS_STUDIO")`, with False
    as the default.
    """
    studio_flag = getenv_bool("DATACHAIN_IS_STUDIO", default=False)
    return studio_flag

is_local

is_local() -> bool

Check if the runtime environment is local (not Studio).

Source code in datachain/lib/dc/utils.py

def is_local() -> bool:
    """Tell whether the code runs locally, i.e. `is_studio()` is false."""
    running_in_studio = is_studio()
    return not running_in_studio

Column

Column(
    text, type_=None, is_literal=False, _selectable=None
)

Bases: ColumnClause

Source code in datachain/query/schema.py

def __init__(self, text, type_=None, is_literal=False, _selectable=None):
    """Create a dataset column.

    `text` is converted with `ColumnMeta.to_db_name` and the result is used
    as the column name; the remaining arguments are passed through to the
    parent initializer unchanged.
    """
    self.name = ColumnMeta.to_db_name(text)
    parent_kwargs = {
        "type_": type_,
        "is_literal": is_literal,
        "_selectable": _selectable,
    }
    super().__init__(self.name, **parent_kwargs)

glob

glob(glob_str)

Search for matches using glob pattern matching.

Source code in datachain/query/schema.py

def glob(self, glob_str):
    """Build a `GLOB` operator expression matching this column to `glob_str`."""
    glob_operator = self.op("GLOB")
    return glob_operator(glob_str)

regexp

regexp(regexp_str)

Search for matches using regexp pattern matching.

Source code in datachain/query/schema.py

def regexp(self, regexp_str):
    """Build a `REGEXP` operator expression matching this column to `regexp_str`."""
    regexp_operator = self.op("REGEXP")
    return regexp_operator(regexp_str)

ColumnExpr

A column expression - either a Column reference like C("file.size") or any arithmetic or comparison expression built from columns.

C("width") * C("height")
C("file.size") // 1024
(C("score") > 0.5) | (C("label") == "positive")

DataChainSchema

DataChainSchema(signal_schema: SignalSchema)

Bases: dict[str, DataType]

Dict-like public view of a DataChain schema.

Top-level schema fields are available through the standard dict API. Use :meth:flatten for leaf columns and :meth:to_string for the printable tree format.

Source code in datachain/lib/dc/datachain.py

def __init__(self, signal_schema: SignalSchema) -> None:
    """Wrap a ``SignalSchema`` in a dict-like view.

    The dict contents are initialized from ``signal_schema.values``; the
    schema itself is kept on the instance for the other view methods.
    """
    top_level_fields = signal_schema.values
    super().__init__(top_level_fields)
    self._signal_schema = signal_schema

str

__str__() -> str

Return the printable schema tree.

Source code in datachain/lib/dc/datachain.py

def __str__(self) -> str:
    """Render the schema as its printable tree (same as `to_string()`)."""
    rendered_tree = self.to_string()
    return rendered_tree

flatten

flatten(include_hidden: bool = True) -> dict[str, DataType]

Return flattened leaf column names and their types.

Parameters:

include_hidden (bool, default: True ) –

Whether to include hidden fields from complex signals.

Source code in datachain/lib/dc/datachain.py

def flatten(self, include_hidden: bool = True) -> dict[str, DataType]:
    """Map every leaf column to its type, keyed by dotted path.

    Entries of the flat tree that have a subtree are skipped, so only leaf
    columns end up in the result.

    Parameters:
        include_hidden: Whether to include hidden fields from complex signals.
    """
    leaf_columns = {}
    flat_tree = self._signal_schema.get_flat_tree(include_hidden=include_hidden)
    for path, type_, has_subtree, _ in flat_tree:
        if has_subtree:
            continue
        leaf_columns[".".join(path)] = type_
    return leaf_columns

to_string

to_string(
    include_hidden: bool = True, indent: int = 2
) -> str

Return the schema as an indented tree.

Parameters:

include_hidden (bool, default: True ) –

Whether to include hidden fields from complex signals.
indent (int, default: 2 ) –

Number of spaces to indent nested fields.

Source code in datachain/lib/dc/datachain.py

def to_string(self, include_hidden: bool = True, indent: int = 2) -> str:
    """Render the schema as an indented tree and return it as text.

    The tree is printed into an in-memory buffer; a single trailing newline,
    if present, is removed from the result.

    Parameters:
        include_hidden: Whether to include hidden fields from complex signals.
        indent: Number of spaces to indent nested fields.
    """
    buffer = io.StringIO()
    self._signal_schema.print_tree(
        indent=indent, include_hidden=include_hidden, file=buffer
    )
    rendered = buffer.getvalue()
    return rendered.removesuffix("\n")

DataChain

DataChain(
    query: DatasetQuery,
    settings: Settings,
    signal_schema: SignalSchema,
    setup: dict | None = None,
    _sys: bool = False,
)

DataChain - a data structure for batch data processing and evaluation.

It represents a sequence of data manipulation steps such as reading data from storages, running AI or LLM models or calling external services API to validate or enrich data.

Data in DataChain is presented as Python classes with an arbitrary set of fields, including nested classes. The data classes have to inherit from the DataModel class. The supported field types include most of the types supported by the underlying Pydantic library.

dataset `property`

dataset: DatasetRecord | None

Underlying dataset, if there is one.

delta `property`

delta: bool

Return True if this chain is running in "delta" update mode.

delta_unsafe `property`

delta_unsafe: bool

Returns True if the chain runs in unsafe "delta" update mode.

empty `property`

empty: bool

Return True if the chain has zero rows.

job `property`

job: Job

Get the job for this chain.

Returns the existing job if running in SaaS, or creates a new one if running locally.

name `property`

name: str | None

Name of the underlying dataset, if there is one.

namespace_name `property`

namespace_name: str

Current namespace name in which the chain is running.

project_name `property`

project_name: str

Current project name in which the chain is running.

schema `property`

schema: DataChainSchema

Get a dict-like schema view of the chain.

The returned object maps top-level signal names to Python types and can also produce leaf-column views:

ds.schema                 # {"file": File, "score": float}
ds.schema.flatten()       # {"file.path": str, "file.size": int, ...}
print(ds.schema)          # printable nested schema tree

session `property`

session: Session

Session of the chain.

version `property`

version: str | None

Version of the underlying dataset, if there is one.

iter

__iter__() -> Iterator[tuple[DataValue, ...]]

Make DataChain objects iterable.

Yields:

tuple[DataValue, ...] –

Yields tuples of all column values for each row.

Example

for row in chain:
    print(row)

Source code in datachain/lib/dc/datachain.py

def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
    """Iterate over the rows of the chain (delegates to `to_iter()`).

    Yields:
        (tuple[DataValue, ...]): Yields tuples of all column values for each row.

    Example:
        ```py
        for row in chain:
            print(row)
        ```
    """
    row_iterator = self.to_iter()
    return row_iterator

or

__or__(other: Self) -> Self

Return self.union(other).

Source code in datachain/lib/dc/datachain.py

def __or__(self, other: "Self") -> "Self":
    """Support `chain | other` as shorthand for `self.union(other)`."""
    combined = self.union(other)
    return combined

repr

__repr__() -> str

Return a string representation of the chain.

Source code in datachain/lib/dc/datachain.py

def __repr__(self) -> str:
    """Describe the chain as text.

    Returns ``"Empty <ClassName>"`` when the effective signal schema has no
    values, otherwise the chain schema followed by a newline.
    """
    classname = self.__class__.__name__
    if self._effective_signals_schema.values:
        return f"{self.schema}\n"
    return f"Empty {classname}"

agg

agg(
    func: Callable | None = None,
    partition_by: PartitionByType | None = None,
    params: str | Sequence[str] | None = None,
    output: OutputType = None,
    **signal_map: Callable
) -> Self

Aggregate rows using partition_by statement and apply a function to the groups of aggregated rows. The function needs to return new objects for each group of the new rows. It returns a chain itself with new signals.

Input-output relationship: N:M

This method bears similarity to gen() and map(), employing a comparable set of parameters, yet differs in two crucial aspects:

The partition_by parameter: This specifies the column name or a list of column names that determine the grouping criteria for aggregation.
Group-based UDF function input: Instead of individual rows, the function receives a list of all rows within each group defined by partition_by.

If partition_by is not set or is an empty list, all rows will be placed into a single group.

Parameters:

func (Callable | None, default: None ) –

Function applied to each group of rows.
partition_by (PartitionByType | None, default: None ) –

Column name(s) to group by. If None, all rows go into one group.
params (str | Sequence[str] | None, default: None ) –

List of column names used as input for the function. Default is taken from function signature.
output (OutputType, default: None ) –

Dictionary defining new signals and their corresponding types. Default type is taken from function signature.
**signal_map (Callable, default: {} ) –

kwargs can be used to define func together with its return signal name in format of agg(result_column=my_func).

Examples:

Basic aggregation with lambda function:

chain = chain.agg(
    total=lambda category, amount: [sum(amount)],
    output=float,
    partition_by="category",
)
chain.save("new_dataset")

An alternative syntax, when you need to specify a more complex function:

# It automatically resolves which columns to pass to the function
# by looking at the function signature.
def agg_sum(
    file: list[File], amount: list[float]
) -> Iterator[tuple[File, float]]:
    yield file[0], sum(amount)

chain = chain.agg(
    agg_sum,
    output={"file": File, "total": float},
    # Alternative syntax is to use `C` (short for Column) to specify
    # a column name or a nested column, e.g. C("file.path").
    partition_by=C("category"),
)
chain.save("new_dataset")

Using complex signals for partitioning (File or any Pydantic BaseModel):

def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
    yield files[0], sum(f.size for f in files)

chain = chain.agg(
    my_agg,
    params=("file",),
    output={"file": File, "total": int},
    partition_by="file",  # Column referring to all sub-columns of File
)
chain.save("new_dataset")

Aggregating all rows into a single group (when partition_by is not set):

chain = chain.agg(
    total_size=lambda file, size: [sum(size)],
    output=int,
    # No partition_by specified - all rows go into one group
)
chain.save("new_dataset")

Multiple partition columns:

chain = chain.agg(
    total=lambda category, subcategory, amount: [sum(amount)],
    output=float,
    partition_by=["category", "subcategory"],
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def agg(
    self,
    /,
    func: Callable | None = None,
    partition_by: PartitionByType | None = None,
    params: str | Sequence[str] | None = None,
    output: OutputType = None,
    **signal_map: Callable,
) -> "Self":
    """Group rows with `partition_by` and run a function over each group.

    The function receives the aggregated rows of one group and needs to return
    new objects for that group. The result is a chain with the new signals.

    Input-output relationship: N:M

    This method is similar to `gen()` and `map()` and takes a comparable set of
    parameters, but differs in two important ways:

    1. The `partition_by` parameter: the column name or list of column names
       that define the grouping criteria for aggregation.
    2. Group-based UDF function input: instead of individual rows, the function
       receives a list of all rows within each group defined by `partition_by`.

    If `partition_by` is not set or is an empty list, all rows will be placed
    into a single group.

    Parameters:
        func: Function applied to each group of rows.
        partition_by: Column name(s) to group by. If None, all rows go
            into one group.
        params: List of column names used as input for the function. Default is
            taken from function signature.
        output: Dictionary defining new signals and their corresponding types.
            Default type is taken from function signature.
        **signal_map: kwargs can be used to define `func` together with its return
            signal name in format of `agg(result_column=my_func)`.

    Raises:
        SignalResolvingError: If a string in `partition_by` does not resolve to
            any column of the chain.

    Examples:
        Basic aggregation with lambda function:
        ```py
        chain = chain.agg(
            total=lambda category, amount: [sum(amount)],
            output=float,
            partition_by="category",
        )
        chain.save("new_dataset")
        ```

        An alternative syntax, when you need to specify a more complex function:
        ```py
        # It automatically resolves which columns to pass to the function
        # by looking at the function signature.
        def agg_sum(
            file: list[File], amount: list[float]
        ) -> Iterator[tuple[File, float]]:
            yield file[0], sum(amount)

        chain = chain.agg(
            agg_sum,
            output={"file": File, "total": float},
            # Alternative syntax is to use `C` (short for Column) to specify
            # a column name or a nested column, e.g. C("file.path").
            partition_by=C("category"),
        )
        chain.save("new_dataset")
        ```

        Using complex signals for partitioning (`File` or any Pydantic `BaseModel`):
        ```py
        def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
            yield files[0], sum(f.size for f in files)

        chain = chain.agg(
            my_agg,
            params=("file",),
            output={"file": File, "total": int},
            partition_by="file",  # Column referring to all sub-columns of File
        )
        chain.save("new_dataset")
        ```

        Aggregating all rows into a single group (when `partition_by` is not set):
        ```py
        chain = chain.agg(
            total_size=lambda file, size: [sum(size)],
            output=int,
            # No partition_by specified - all rows go into one group
        )
        chain.save("new_dataset")
        ```

        Multiple partition columns:
        ```py
        chain = chain.agg(
            total=lambda category, subcategory, amount: [sum(amount)],
            output=float,
            partition_by=["category", "subcategory"],
        )
        chain.save("new_dataset")
        ```
    """
    partition_columns: list[ColumnExpr] = []

    if partition_by is not None:
        # A single name/function/expression is treated as a one-item list.
        if isinstance(partition_by, (str, Function, ColumnExpr)):
            requested = [partition_by]
        else:
            requested = list(partition_by)

        for item in requested:
            if isinstance(item, str):
                # A name may expand to several DB columns (e.g. a complex signal).
                resolved = self.signals_schema.db_signals(name=item, as_columns=True)
                if not resolved:
                    raise SignalResolvingError([item], "is not found")
                partition_columns.extend(cast("list[Column]", resolved))
                continue

            if isinstance(item, Function):
                partition_columns.append(item.get_column(self.signals_schema))
                continue

            # Anything else is assumed to already be a ColumnExpr.
            partition_columns.append(item)

    udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)

    aggregated_query = self._query.generate(
        udf_obj.to_udf_wrapper(self._settings.batch_size),
        partition_by=partition_columns,
        is_aggregator=True,
        **self._settings.to_dict(),
    )
    # The resulting schema is `sys` plus whatever the UDF outputs.
    aggregated_schema = SignalSchema({"sys": Sys}) | udf_obj.output

    return self._evolve(query=aggregated_query, signal_schema=aggregated_schema)

apply

apply(func, *args, **kwargs)

Apply any function to the chain.

Useful for reusing in a chain of operations.

Example

import datachain as dc
def parse_stem(chain):
    return chain.map(
        lambda file: file.get_file_stem(),
        output={"stem": str}
    )

chain = (
    dc.read_storage("s3://my-bucket")
    .apply(parse_stem)
    .filter(C("stem").glob("*cat*"))
)

Source code in datachain/lib/dc/datachain.py

def apply(self, func, *args, **kwargs):
    """Apply any function to the chain.

    Useful for packaging a reusable sequence of chain operations into a
    single callable and plugging it into a fluent pipeline.

    Parameters:
        func: Callable invoked as `func(chain, *args, **kwargs)`.
        *args: Extra positional arguments forwarded to `func`.
        **kwargs: Extra keyword arguments forwarded to `func`.

    Returns:
        Whatever `func` returns (normally a new chain).

    Example:
        ```py
        import datachain as dc
        def parse_stem(chain):
            return chain.map(
                lambda file: file.get_file_stem(),
                output={"stem": str}
            )

        chain = (
            dc.read_storage("s3://my-bucket")
            .apply(parse_stem)
            .filter(C("stem").glob("*cat*"))
        )
        ```
    """
    # The chain itself is always passed as the first positional argument.
    return func(self, *args, **kwargs)

avg

avg(col: str) -> StandardType

Compute the average of a column.

Parameters:

col (str) –

The column to compute the average for.

Returns:

StandardType –

The average of the column values.

Example

average_size = chain.avg("file.size")
print(f"Average size: {average_size}")

Source code in datachain/lib/dc/datachain.py

def avg(self, col: str) -> StandardType:  # type: ignore[override]
    """Return the average value of a column.

    Parameters:
        col: Name of the column to average.

    Returns:
        The average of the column values.

    Example:
        ```py
        average_size = chain.avg("file.size")
        print(f"Average size: {average_size}")
        ```
    """
    # Delegates to the shared aggregate helper, selecting the "avg" operation.
    average = self._extend_to_data_model("avg", col)
    return average

c

c(column: str | Column) -> Column

Returns Column instance attached to the current chain.

Source code in datachain/lib/dc/datachain.py

def c(self, column: str | Column) -> Column:
    """Return a Column bound to this chain's query table.

    Accepts either a column name or an existing Column; in the latter case
    only its name is used to look the column up in the current schema.
    """
    column_name = column if isinstance(column, str) else column.name
    attached = self.column(column_name)
    # Bind the resolved column to the table of the current query.
    attached.table = self._query.table
    return attached

chunk

chunk(index: int, total: int) -> Self

Split a chain into smaller chunks for e.g. parallelization.

Parameters:

index (int) –

The index of the chunk (0-indexed).
total (int) –

The total number of chunks.

Example

import datachain as dc

chain = dc.read_storage(...)
chunk_1 = chain.chunk(0, 2)
chunk_2 = chain.chunk(1, 2)

Note

Bear in mind that index is 0-indexed but total isn't. Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.

Source code in datachain/lib/dc/datachain.py

def chunk(self, index: int, total: int) -> "Self":
    """Split a chain into smaller chunks for e.g. parallelization.

    Parameters:
        index: The index of the chunk (0-indexed).
        total: The total number of chunks.

    Returns:
        A new chain restricted to the requested chunk.

    Example:
        ```py
        import datachain as dc

        chain = dc.read_storage(...)
        chunk_1 = chain.chunk(0, 2)
        chunk_2 = chain.chunk(1, 2)
        ```

    Note:
        Bear in mind that `index` is 0-indexed but `total` isn't.
        Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
    """
    return self._evolve(query=self._query.chunk(index, total))

clone

clone() -> Self

Make a copy of the chain in a new table.

Source code in datachain/lib/dc/datachain.py

def clone(self) -> "Self":
    """Return a copy of this chain whose query is cloned into a new table."""
    cloned_query = self._query.clone(new_table=True)
    return self._evolve(query=cloned_query)

column

column(name: str) -> Column

Returns Column instance with a type if name is found in current schema, otherwise raises an exception.

Source code in datachain/lib/dc/datachain.py

def column(self, name: str) -> Column:
    """Look up `name` in the current schema and return it as a typed Column.

    The name may be written with "." or with `DEFAULT_DELIMITER` between
    nested parts; "." takes precedence when present.

    Raises:
        ValueError: If no entry of the flattened schema matches the name.
    """
    # Pick the separator used in `name`, if any.
    if "." in name:
        separator = "."
    elif DEFAULT_DELIMITER in name:
        separator = DEFAULT_DELIMITER
    else:
        separator = None
    name_path = [name] if separator is None else name.split(separator)

    for path, type_, _, _ in self.signals_schema.get_flat_tree():
        if path != name_path:
            continue
        # First matching path wins; convert its Python type to a SQL type.
        return Column(name, python_to_sql(type_))

    raise ValueError(f"Column with name {name} not found in the schema")

count

count() -> int

Return the number of rows in the chain.

Source code in datachain/lib/dc/datachain.py

def count(self) -> int:
    """Count the rows in the chain.

    Returns:
        The row count reported by the underlying query.
    """
    total_rows = self._query.count()
    return total_rows

diff

diff(
    other: DataChain,
    on: str | Sequence[str],
    right_on: str | Sequence[str] | None = None,
    compare: str | Sequence[str] | None = None,
    right_compare: str | Sequence[str] | None = None,
    added: bool = True,
    deleted: bool = True,
    modified: bool = True,
    same: bool = False,
    status_col: str | None = None,
) -> DataChain

Calculate differences between two chains.

This method identifies records that are added, deleted, modified, or unchanged between two chains. It adds a status column with values: A=added, D=deleted, M=modified, S=same.

Parameters:

other (DataChain) –

Chain to compare against.
on (str | Sequence[str]) –

Column(s) to match records between chains.
right_on (str | Sequence[str] | None, default: None ) –

Column(s) in the other chain to match against. Defaults to on.
compare (str | Sequence[str] | None, default: None ) –

Column(s) to check for changes. If not specified, all columns are used.
right_compare (str | Sequence[str] | None, default: None ) –

Column(s) in the other chain to compare against. Defaults to values of compare.
added (bool, default: True ) –

Include records that exist in this chain but not in the other.
deleted (bool, default: True ) –

Include records that exist only in the other chain.
modified (bool, default: True ) –

Include records that exist in both but have different values.
same (bool, default: False ) –

Include records that are identical in both chains.
status_col (str, default: None ) –

Name for the status column showing differences.

Default behavior: By default, shows added, deleted, and modified records, but excludes unchanged records (same=False). Status column is not created.

Example

res = persons.diff(
    new_persons,
    on=["id"],
    right_on=["other_id"],
    compare=["name"],
    added=True,
    deleted=True,
    modified=True,
    same=True,
    status_col="diff"
)

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def diff(
    self,
    other: "DataChain",
    on: str | Sequence[str],
    right_on: str | Sequence[str] | None = None,
    compare: str | Sequence[str] | None = None,
    right_compare: str | Sequence[str] | None = None,
    added: bool = True,
    deleted: bool = True,
    modified: bool = True,
    same: bool = False,
    status_col: str | None = None,
) -> "DataChain":
    """Calculate differences between two chains.

    Records are classified as added, deleted, modified, or unchanged between
    the two chains. The status values are: A=added, D=deleted, M=modified,
    S=same.

    Parameters:
        other: Chain to compare against.
        on: Column(s) to match records between chains.
        right_on: Column(s) in the other chain to match against.
                 Defaults to `on`.
        compare: Column(s) to check for changes.
                 If not specified, all columns are used.
        right_compare: Column(s) in the other chain to compare against.
                 Defaults to values of `compare`.
        added (bool): Include records that exist in this chain but not in
                 the other.
        deleted (bool): Include records that exist only in the other chain.
        modified (bool): Include records that exist in both
                 but have different values.
        same (bool): Include records that are identical in both chains.
        status_col (str): Name for the status column showing differences.

    Default behavior: By default, shows added, deleted, and modified records,
    but excludes unchanged records (same=False). Status column is not created
    unless `status_col` is given.

    Example:
        ```py
        res = persons.diff(
            new_persons,
            on=["id"],
            right_on=["other_id"],
            compare=["name"],
            added=True,
            deleted=True,
            modified=True,
            same=True,
            status_col="diff"
        )
        ```
    """
    # Imported lazily, inside the method, as in the original implementation.
    from datachain.diff import _compare as compare_chains

    # Matching/comparison columns first, then the record-class switches.
    return compare_chains(
        self,
        other,
        on,
        right_on=right_on,
        compare=compare,
        right_compare=right_compare,
        status_col=status_col,
        added=added,
        deleted=deleted,
        modified=modified,
        same=same,
    )

distinct

distinct(*args: str | Column, **kwargs) -> Self

Removes duplicate rows based on uniqueness of some input column(s) i.e if rows are found with the same value of input column(s), only one row is left in the result set.

Example

dc.distinct("file.path")
dc.distinct(file_name=func.path.name(C("file.path")))

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def distinct(  # type: ignore[override]
    self, *args: str | Column, **kwargs
) -> "Self":
    """Drop duplicate rows, judged by the values of the given column(s).

    When several rows share the same value(s) in the input column(s), only
    one of them is kept in the result set. Keyword arguments define named
    expressions that are first added to the chain and then used as
    additional distinct keys.

    Raises:
        TypeError: If neither positional nor keyword arguments are given.

    Example:
        ```py
        dc.distinct("file.path")
        dc.distinct(file_name=func.path.name(C("file.path")))
        ```
    """
    names = self._signal_names(args, "distinct()", named_expressions=True)

    if kwargs:
        # Materialize the named expressions as columns, then recurse using
        # their names (iterating a dict yields its keys) as extra keys.
        with_expressions = self._mutate("distinct()", **kwargs)
        return with_expressions.distinct(*names, *kwargs)

    if not names:
        raise TypeError("distinct() expected at least 1 argument, got 0")

    key_signals = self.signals_schema.resolve(*names).db_signals()
    return self._evolve(query=self._query.distinct(*key_signals))

exec

exec() -> Self

Execute the chain.

Source code in datachain/lib/dc/datachain.py

def exec(self) -> "Self":
    """Execute the chain and return a chain built on the executed query."""
    executed_query = self._query.exec()
    return self._evolve(query=executed_query)

explode

explode(
    col: str,
    model_name: str | None = None,
    column: str | None = None,
    schema_sample_size: int = 1,
) -> DataChain

Explodes a column containing JSON objects (dict or str DataChain type) into individual columns based on the schema of the JSON. Schema is inferred from the first schema_sample_size rows of the column (the first row by default).

Parameters:

col (str) –

the name of the column containing JSON to be exploded.
model_name (str | None, default: None ) –

optional generated model name. By default generates the name automatically.
column (str | None, default: None ) –

optional generated column name. By default generates the name automatically.
schema_sample_size (int, default: 1 ) –

the number of rows to use for inferring the schema of the JSON (in case some fields are optional and it's not enough to analyze a single row).

Returns:

DataChain ( DataChain ) –

A new DataChain instance with the new set of columns.

Source code in datachain/lib/dc/datachain.py

def explode(
    self,
    col: str,
    model_name: str | None = None,
    column: str | None = None,
    schema_sample_size: int = 1,
) -> "DataChain":
    """Explode a JSON column (dict or str DataChain type) into individual
       columns based on the schema of the JSON. The schema is inferred from
       the first `schema_sample_size` rows of the column (one row by default).

    Parameters:
        col: the name of the column containing JSON to be exploded.
        model_name: optional generated model name. By default the name is
            `"<col.title()>ExplodedModel"`.
        column: optional generated column name. By default the name is
            `"<col>_expl"`.
        schema_sample_size: the number of rows to use for inferring the schema of
            the JSON (in case some fields are optional and it's not enough to
            analyze a single row).

    Returns:
        DataChain: A new DataChain instance with the new set of columns.

    Raises:
        TypeError: If any sampled value is not a dict after optional JSON
            decoding.
    """
    import pyarrow as pa

    from datachain.lib.arrow import schema_to_output

    # Sample rows and decode string values so every sample is a parsed object.
    sampled_rows = self.limit(schema_sample_size).to_list(col)
    samples = []
    for (raw_value,) in sampled_rows:
        if isinstance(raw_value, str):
            samples.append(json.loads(raw_value))
        else:
            samples.append(raw_value)

    if not all(isinstance(sample, dict) for sample in samples):
        raise TypeError(f"Column {col} should be a string or dict type with JSON")

    # Let Arrow infer a unified schema across the sampled dicts.
    inferred_schema = pa.Table.from_pylist(samples).schema
    output, original_names = schema_to_output(inferred_schema, None)

    model_name = model_name or f"{col.title()}ExplodedModel"
    model = dict_to_data_model(model_name, output, original_names)

    def json_to_model(json_value: str | dict):
        json_dict = (
            json.loads(json_value) if isinstance(json_value, str) else json_value
        )
        return model.model_validate(json_dict)

    column = column or f"{col}_expl"

    return self.map(json_to_model, params=col, output={column: model})

file_diff

file_diff(
    other: DataChain,
    on: str = "file",
    right_on: str | None = None,
    added: bool = True,
    modified: bool = True,
    deleted: bool = False,
    same: bool = False,
    status_col: str | None = None,
) -> DataChain

Calculate differences between two chains containing files.

This method is specifically designed for file chains. It uses file source and path to match files, and file version and etag to detect changes.

Parameters:

other (DataChain) –

Chain to compare against.
on (str, default: 'file' ) –

File column name in this chain. Default is "file".
right_on (str | None, default: None ) –

File column name in the other chain. Defaults to on.
added (bool, default: True ) –

Include files that exist in this chain but not in the other.
deleted (bool, default: False ) –

Include files that exist only in the other chain.
modified (bool, default: True ) –

Include files that exist in both but have different versions/etags.
same (bool, default: False ) –

Include files that are identical in both chains.
status_col (str, default: None ) –

Name for the status column showing differences (A=added, D=deleted, M=modified, S=same).

Default behavior: By default, includes only new files (added=True and modified=True). This is useful for incremental processing.

Example

diff = images.file_diff(
    new_images,
    on="file",
    right_on="other_file",
    added=True,
    deleted=True,
    modified=True,
    same=True,
    status_col="diff"
)

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def file_diff(
    self,
    other: "DataChain",
    on: str = "file",
    right_on: str | None = None,
    added: bool = True,
    modified: bool = True,
    deleted: bool = False,
    same: bool = False,
    status_col: str | None = None,
) -> "DataChain":
    """Calculate differences between two chains containing files.

    A file-oriented wrapper around `diff()`: files are matched on their
    `source` and `path` fields, and changes are detected through their
    `version` and `etag` fields.

    Parameters:
        other: Chain to compare against.
        on: File column name in this chain. Default is "file".
        right_on: File column name in the other chain. Defaults to `on`.
        added (bool): Include files that exist in this chain but not in the other.
        deleted (bool): Include files that exist only in the other chain.
        modified (bool): Include files that exist in both but have different
                         versions/etags.
        same (bool): Include files that are identical in both chains.
        status_col (str): Name for the status column showing differences
                          (A=added, D=deleted, M=modified, S=same).

    Default behavior: By default, includes only new files (added=True and
    modified=True). This is useful for incremental processing.

    Example:
        ```py
        diff = images.file_diff(
            new_images,
            on="file",
            right_on="other_file",
            added=True,
            deleted=True,
            modified=True,
            same=True,
            status_col="diff"
        )
        ```
    """
    right_on = right_on or on

    # File fields that identify a file vs. fields that reveal a change.
    identity_fields = ("source", "path")
    change_fields = ("version", "etag")

    return self.diff(
        other,
        [f"{on}.{field}" for field in identity_fields],
        right_on=[f"{right_on}.{field}" for field in identity_fields],
        compare=[f"{on}.{field}" for field in change_fields],
        right_compare=[f"{right_on}.{field}" for field in change_fields],
        added=added,
        deleted=deleted,
        modified=modified,
        same=same,
        status_col=status_col,
    )

filter

filter(*args: Any) -> Self

Filter the chain according to conditions.

Example

Basic usage with built-in operators

dc.filter(C("width") < 200)

Using glob to match patterns (case-sensitive, shell-style wildcards)

dc.filter(C("file.path").glob("*.jpg"))

Using like / ilike for SQL pattern matching with % and _ wildcards. like is case-sensitive; ilike is case-insensitive:

dc.filter(C("text").like("%empty vehicle%"))
dc.filter(C("text").ilike("%empty vehicle%"))

Using regexp for regular-expression matching. Case-sensitive by default; prepend the (?i) inline flag for case-insensitive:

dc.filter(C("file.name").regexp(r"^IMG_\d+\.jpg$"))
dc.filter(C("file.name").regexp(r"(?i)^img_\d+\.jpg$"))

Using in to match lists

ids = [1,2,3]
dc.filter(C("experiment_id").in_(ids))

Using datachain.func

from datachain.func import string
dc.filter(string.length(C("file.path")) > 5)

Combining filters with "or"

dc.filter(
    C("file.path").glob("cat*") |
    C("file.path").glob("dog*")
)

dc.filter(dc.func.or_(
    C("file.path").glob("cat*"),
    C("file.path").glob("dog*")
))

Combining filters with "and"

dc.filter(
    C("file.path").glob("*.jpg"),
    string.length(C("file.path")) > 5
)

dc.filter(
    C("file.path").glob("*.jpg") &
    (string.length(C("file.path")) > 5)
)

dc.filter(dc.func.and_(
    C("file.path").glob("*.jpg"),
    string.length(C("file.path")) > 5
))

Combining filters with "not"

dc.filter(~(C("file.path").glob("*.jpg")))

Quick reference for column-level filter operators:

Operator	Wildcards / syntax	Case sensitive?
`glob(pattern)`	shell: `*`, `?`, `[…]`	yes
`like(pattern)`	SQL: `%`, `_`	yes
`ilike(pattern)`	SQL: `%`, `_`	no
`regexp(pattern)`	regular expression	yes (use `(?i)`)
`in_(values)`	exact values	yes

Source code in datachain/lib/dc/datachain.py

@resolve_columns
def filter(self, *args: Any) -> "Self":
    r"""Keep only the rows of the chain that satisfy the given conditions.

    Several positional conditions are combined with "and".

    Example:
        Basic usage with built-in operators
        ```py
        dc.filter(C("width") < 200)
        ```

        Using glob to match patterns (case-sensitive, shell-style wildcards)
        ```py
        dc.filter(C("file.path").glob("*.jpg"))
        ```

        Using `like` / `ilike` for SQL pattern matching with `%` and `_`
        wildcards. `like` is case-sensitive; `ilike` is case-insensitive:
        ```py
        dc.filter(C("text").like("%empty vehicle%"))
        dc.filter(C("text").ilike("%empty vehicle%"))
        ```

        Using `regexp` for regular-expression matching. Case-sensitive by
        default; prepend the `(?i)` inline flag for case-insensitive:
        ```py
        dc.filter(C("file.name").regexp(r"^IMG_\d+\.jpg$"))
        dc.filter(C("file.name").regexp(r"(?i)^img_\d+\.jpg$"))
        ```

        Using `in_` to match lists
        ```py
        ids = [1,2,3]
        dc.filter(C("experiment_id").in_(ids))
        ```

        Using `datachain.func`
        ```py
        from datachain.func import string
        dc.filter(string.length(C("file.path")) > 5)
        ```

        Combining filters with "or"
        ```py
        dc.filter(
            C("file.path").glob("cat*") |
            C("file.path").glob("dog*")
        )
        ```

        ```py
        dc.filter(dc.func.or_(
            C("file.path").glob("cat*"),
            C("file.path").glob("dog*")
        ))
        ```

        Combining filters with "and"
        ```py
        dc.filter(
            C("file.path").glob("*.jpg"),
            string.length(C("file.path")) > 5
        )
        ```

        ```py
        dc.filter(
            C("file.path").glob("*.jpg") &
            (string.length(C("file.path")) > 5)
        )
        ```

        ```py
        dc.filter(dc.func.and_(
            C("file.path").glob("*.jpg"),
            string.length(C("file.path")) > 5
        ))
        ```

        Combining filters with "not"
        ```py
        dc.filter(~(C("file.path").glob("*.jpg")))
        ```

        Quick reference for column-level filter operators:

        | Operator          | Wildcards / syntax     | Case sensitive?  |
        |-------------------|------------------------|------------------|
        | `glob(pattern)`   | shell: `*`, `?`, `[…]` | yes              |
        | `like(pattern)`   | SQL: `%`, `_`          | yes              |
        | `ilike(pattern)`  | SQL: `%`, `_`          | **no**           |
        | `regexp(pattern)` | regular expression     | yes (use `(?i)`) |
        | `in_(values)`     | exact values           | yes              |
    """
    filtered_query = self._query.filter(*args)
    return self._evolve(query=filtered_query)

gen

gen(
    func: Callable | Generator | None = None,
    params: str | Sequence[str] | None = None,
    output: OutputType = None,
    **signal_map
) -> Self

Apply a function to each row to create new rows (with potentially new signals). The function needs to return new objects for each of the new rows. It returns a chain itself with new signals.

Input-output relationship: 1:N

This method is similar to map() and uses the same list of parameters, but with one key difference: it produces a sequence of rows for each input row (like extracting multiple file records from a single tar file or bounding boxes from a single image file).

Example

chain = chain.gen(
    line=lambda file: [l for l in file.read().split("\n")],
    output=str,
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

def gen(
    self,
    func: Callable | Generator | None = None,
    params: str | Sequence[str] | None = None,
    output: OutputType = None,
    **signal_map,
) -> "Self":
    r"""Apply a function to each row to create new rows (with potentially new
    signals). The function needs to return new objects for each of the new
    rows. The method returns a chain with the new signals.

    Input-output relationship: 1:N

    This method is similar to `map()` and uses the same list of parameters,
    but with one key difference: it produces a sequence of rows for each input
    row (like extracting multiple file records from a single tar file or
    bounding boxes from a single image file).

    Example:
        ```py
        chain = chain.gen(
            line=lambda file: [l for l in file.read().split("\n")],
            output=str,
        )
        chain.save("new_dataset")
        ```
    """
    generator_udf = self._udf_to_obj(Generator, func, params, output, signal_map)

    # Only override the UDF's prefetch when the chain settings define one.
    prefetch = self._settings.prefetch
    if prefetch is not None:
        generator_udf.prefetch = prefetch

    generated_query = self._query.generate(
        generator_udf.to_udf_wrapper(self._settings.batch_size),
        **self._settings.to_dict(),
    )
    # The resulting schema is the "sys" signal plus the UDF's outputs.
    result_schema = SignalSchema({"sys": Sys}) | generator_udf.output
    return self._evolve(query=generated_query, signal_schema=result_schema)

group_by

group_by(
    *args: Any,
    partition_by: (
        str
        | Func
        | ColumnExpr
        | Sequence[str | Func | ColumnExpr]
        | None
    ) = None,
    **kwargs: Func
) -> Self

Group rows by specified set of signals and return new signals with aggregated values.

The supported functions

count(), sum(), avg(), min(), max(), any_value(), collect(), concat()

Example

chain = chain.group_by(
    cnt=func.count(),
    partition_by=("file_source", "file_ext"),
)

Using complex signals:

chain = chain.group_by(
    total_size=func.sum("file.size"),
    count=func.count(),
    partition_by="file",  # Uses column name, expands to File's unique keys
)

Source code in datachain/lib/dc/datachain.py

@delta_disabled  # type: ignore[arg-type]
def group_by(  # noqa: C901, PLR0912, PLR0915
    self,
    *args: Any,
    partition_by: (
        str | Func | ColumnExpr | Sequence[str | Func | ColumnExpr] | None
    ) = None,
    **kwargs: Func,
) -> "Self":
    """Group rows by specified set of signals and return new signals
    with aggregated values.

    The supported functions:
       count(), sum(), avg(), min(), max(), any_value(), collect(), concat()

    Parameters:
        *args: Not supported; any positional argument raises an error that
            points the caller to `partition_by=` and keyword aggregates.
        partition_by: Grouping key(s): a column name, a function, a column
            expression, or a sequence of these. `None` means no grouping keys.
        **kwargs: Aggregations as `output_name=Func` pairs. At least one is
            required.

    Returns:
        A new chain built from the grouped query and the grouped schema.

    Raises:
        DataChainParamsError: If a non-`Func` positional argument is passed.
        SignalResolvingError: If a string in `partition_by` resolves to no
            columns.
        DataChainColumnError: If a `partition_by` item has an unsupported
            type, if a partition output name collides with an aggregation
            name, or if a keyword value is not a `Func`.
        ValueError: If no aggregation keyword arguments are given.

    Example:
        ```py
        chain = chain.group_by(
            cnt=func.count(),
            partition_by=("file_source", "file_ext"),
        )
        ```

        Using complex signals:
        ```py
        chain = chain.group_by(
            total_size=func.sum("file.size"),
            count=func.count(),
            partition_by="file",  # Uses column name, expands to File's unique keys
        )
        ```
    """
    # Positional arguments are rejected up front with a targeted message.
    if args:
        if isinstance(args[0], Func):
            raise self._named_expression_error("group_by()", args[0])
        raise DataChainParamsError(
            "group_by() does not accept positional arguments "
            f"of type {type(args[0]).__name__}; pass grouping columns with "
            "`partition_by=` and aggregate expressions as keyword arguments"
        )

    # Normalize partition_by to an iterable of items.
    if partition_by is None:
        partition_by = []
    elif isinstance(partition_by, (str, Func, ColumnExpr)):
        partition_by = [partition_by]

    partition_by_columns: list[Column] = []  # columns passed as GROUP BY keys
    signal_columns: list[Column] = []  # labeled output columns of the query
    schema_fields: dict[str, DataType] = {}
    keep_columns: list[str] = []
    partial_fields: list[str] = []  # Track specific fields for partial creation
    schema_partition_by: list[str] = []  # string keys forwarded to the schema
    # Start counter past any existing gr_N columns and kwargs to avoid collisions
    partition_counter = 0
    for name in (*self.signals_schema.values, *kwargs):
        if name.startswith("gr_"):
            try:
                partition_counter = max(partition_counter, int(name[3:]) + 1)
            except ValueError:
                # Suffix after "gr_" is not an integer, so it cannot collide
                # with an auto-generated label.
                pass

    for col in partition_by:
        if isinstance(col, str):
            columns = self.signals_schema.db_signals(name=col, as_columns=True)
            if not columns:
                raise SignalResolvingError([col], "is not found")
            partition_by_columns.extend(cast("list[Column]", columns))
            # GROUP BY the ancestor sentinel so the result keeps _type_tag.
            for leaf in cast("list[Column]", columns):
                sentinel = self.signals_schema.optional_parent_sentinel(leaf.name)
                if sentinel:
                    partition_by_columns.append(Column(sentinel))

            # For nested field references (e.g., "nested.level1.name"),
            # we need to distinguish between:
            # 1. References to fields within a complex signal (create partials)
            # 2. Deep nested references that should be flattened
            if "." in col:
                # Split the column reference to analyze it
                parts = col.split(".")
                parent_signal = parts[0]
                parent_type = self.signals_schema.values.get(parent_signal)

                if ModelStore.is_partial(parent_type):
                    if parent_signal not in keep_columns:
                        keep_columns.append(parent_signal)
                    partial_fields.append(col)
                    schema_partition_by.append(col)
                else:
                    # BaseModel or other - add flattened columns directly
                    for column in cast("list[Column]", columns):
                        col_type = self.signals_schema.get_column_type(column.name)
                        schema_fields[column.name] = col_type
                    schema_partition_by.append(col)
            else:
                # Top-level signal name: check whether it is a complex signal
                # (a BaseModel subclass) or a simple value.
                col_type = self.signals_schema.get_column_type(
                    col, with_subtree=True
                )
                if isinstance(col_type, type) and issubclass(col_type, BaseModel):
                    # Complex signal - add only the partitioning columns
                    for column in cast("list[Column]", columns):
                        col_type = self.signals_schema.get_column_type(column.name)
                        schema_fields[column.name] = col_type
                    schema_partition_by.append(col)
                # Simple signal - keep the entire signal
                else:
                    if col not in keep_columns:
                        keep_columns.append(col)
                    schema_partition_by.append(col)
        elif isinstance(col, Function):
            # Use the function's own label if it has one, otherwise generate
            # the next free "gr_N" label.
            label = col.col_label
            if not label:
                label = f"gr_{partition_counter}"
                partition_counter += 1
            column = col.get_column(self.signals_schema, label=label)
            partition_by_columns.append(column)
            signal_columns.append(column)
            schema_fields[column.name] = column.type.python_type
        elif isinstance(col, ColumnExpr):
            # Column expressions always get an auto-generated "gr_N" label.
            col_label = f"gr_{partition_counter}"
            partition_counter += 1
            enriched = self.signals_schema.enrich_expr_types(col)
            labeled = cast("Column", enriched.label(col_label))
            inferred = sql_to_python(enriched)
            partition_by_columns.append(labeled)
            signal_columns.append(labeled)
            schema_fields[col_label] = inferred
        else:
            raise DataChainColumnError(
                col,
                (
                    f"partition_by column {col} has type {type(col)}"
                    " but expected str, Function, or ColumnExpr"
                ),
            )

    # Checked only after partition_by has been processed, so partition_by
    # errors above take precedence over a missing aggregation.
    if not kwargs:
        raise ValueError("At least one column should be provided for group_by")

    # Labeled partition outputs must not share a name with an aggregation.
    partition_output_names = {c.name for c in signal_columns}
    overlap = partition_output_names & set(kwargs.keys())
    if overlap:
        name = next(iter(overlap))
        raise DataChainColumnError(
            name,
            f"partition_by name '{name}' conflicts with aggregation column name",
        )

    for col_name, func in kwargs.items():
        if not isinstance(func, Func):
            raise DataChainColumnError(
                col_name,
                f"Column {col_name} has type {type(func)} but expected Func object",
            )
        column = func.get_column(self.signals_schema, label=col_name)
        signal_columns.append(column)
        schema_fields[col_name] = func.get_result_type(self.signals_schema)

    # NOTE(review): schema_fields, keep_columns and partial_fields are filled
    # above but are not read anywhere below in this function; the resulting
    # schema is derived from schema_partition_by and signal_columns only.
    # Confirm whether they are still needed.
    signal_schema = self.signals_schema.group_by(
        schema_partition_by, signal_columns
    )

    return self._evolve(
        query=self._query.group_by(signal_columns, partition_by_columns),
        signal_schema=signal_schema,
    )

limit

limit(n: int) -> Self

Return the first n rows of the chain.

If the chain is unordered, which rows are returned is undefined. If the chain has less than n rows, the whole chain is returned.

Parameters:

n (int) –

Number of rows to return.

Source code in datachain/lib/dc/datachain.py

def limit(self, n: int) -> "Self":
    """Return a chain with at most the first `n` rows.

    Which rows are returned is undefined when the chain is unordered.
    A chain with fewer than `n` rows is returned whole.

    Parameters:
        n (int): Number of rows to return.
    """
    limited_query = self._query.limit(n)
    return self._evolve(query=limited_query)

map

map(
    func: Callable | None = None,
    params: str | Sequence[str] | None = None,
    output: OutputType = None,
    **signal_map: Any
) -> Self

Apply a function to each row to create new signals. The function should return a new object for each row. It returns a chain itself with new signals.

Input-output relationship: 1:1

Parameters:

func (Callable | None, default: None ) –

Function applied to each row.
params (str | Sequence[str] | None, default: None ) –

List of column names used as input for the function. Default is taken from function signature.
output (OutputType, default: None ) –

Dictionary defining new signals and their corresponding types. Default type is taken from function signature. Default can be also taken from kwargs - **signal_map (see below). If signal name is defined using signal_map (see below) only a single type value can be used.
**signal_map (Any, default: {} ) –

kwargs can be used to define func together with its return signal name in format of map(my_sign=my_func). This helps define signal names and functions in a nicer way.

Example

Using signal_map and single type in output:

chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
chain.save("new_dataset")

Using func and output as a map:

chain = chain.map(
    lambda name: name.split("."), output={"stem": str, "ext": str}
)
chain.save("new_dataset")

Source code in datachain/lib/dc/datachain.py

def map(
    self,
    func: Callable | None = None,
    params: str | Sequence[str] | None = None,
    output: OutputType = None,
    **signal_map: Any,
) -> "Self":
    """Create new signals by applying a function to every row.

    The function is expected to return one new object per row; the result is
    a chain that carries the new signals alongside the existing ones.

    Input-output relationship: 1:1

    Parameters:
        func: Function applied to each row.
        params: List of column names used as input for the function. Default
                is taken from function signature.
        output: Dictionary defining new signals and their corresponding types.
                Default type is taken from function signature. Default can be also
                taken from kwargs - **signal_map (see below).
                If signal name is defined using signal_map (see below) only a single
                type value can be used.
        **signal_map: kwargs can be used to define `func` together with its return
                signal name in format of `map(my_sign=my_func)`. This helps define
                signal names and functions in a nicer way.

    Example:
        Using signal_map and single type in output:
        ```py
        chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
        chain.save("new_dataset")
        ```

        Using func and output as a map:
        ```py
        chain = chain.map(
            lambda name: name.split("."), output={"stem": str, "ext": str}
        )
        chain.save("new_dataset")
        ```
    """
    chain_settings = self._settings
    mapper = self._udf_to_obj(Mapper, func, params, output, signal_map)

    # A chain-level prefetch setting, when present, overrides the UDF's own value.
    prefetch = chain_settings.prefetch
    if prefetch is not None:
        mapper.prefetch = prefetch

    sys_schema = SignalSchema({"sys": Sys})
    udf_query = self._query.add_signals(
        mapper.to_udf_wrapper(chain_settings.batch_size),
        **chain_settings.to_dict(),
    )
    # Resulting schema: sys signals, then the existing signals, then the UDF output.
    return self._evolve(
        query=udf_query,
        signal_schema=sys_schema | self.signals_schema | mapper.output,
    )

max

max(col: str) -> StandardType

Compute the maximum of a column.

Parameters:

col (str) –

The column to compute the maximum for.

Returns:

StandardType –

The maximum value in the column.

Example

max_size = chain.max("file.size")
print(f"Maximum size: {max_size}")

Source code in datachain/lib/dc/datachain.py

def max(self, col: str) -> StandardType:  # type: ignore[override]
    """Return the largest value found in a column.

    Parameters:
        col: The column to compute the maximum for.

    Returns:
        The maximum value in the column.

    Example:
        ```py
        max_size = chain.max("file.size")
        print(f"Maximum size: {max_size}")
        ```
    """
    aggregate_name = "max"
    return self._extend_to_data_model(aggregate_name, col)

merge

merge(
    right_ds: DataChain,
    on: MergeColType | Sequence[MergeColType],
    right_on: (
        MergeColType | Sequence[MergeColType] | None
    ) = None,
    inner=False,
    full=False,
    rname="right_",
) -> Self

Merge two chains based on the specified criteria.

Parameters:

right_ds (DataChain) –

Chain to join with.
on (MergeColType | Sequence[MergeColType]) –

Predicate ("column.name", C("column.name"), or Func) or list of Predicates to join on. If both chains have the same predicates then this predicate is enough for the join. Otherwise, right_on parameter has to specify the predicates for the other chain.
right_on (MergeColType | Sequence[MergeColType] | None, default: None ) –

Optional predicate or list of Predicates for the right_ds to join.
inner (bool, default: False ) –

Whether to run inner join or outer join.
full (bool, default: False ) –

Whether to run full outer join.
rname (str, default: 'right_' ) –

Name prefix for conflicting signal names.

Examples:

meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
                      right_on=(C.name, C.pq__index))

imgs.merge(captions,
           on=func.path.file_stem(imgs.c("file.path")),
           right_on=func.path.file_stem(captions.c("file.path")))

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def merge(
    self,
    right_ds: "DataChain",
    on: MergeColType | Sequence[MergeColType],
    right_on: MergeColType | Sequence[MergeColType] | None = None,
    inner: bool = False,
    full: bool = False,
    rname: str = "right_",
) -> "Self":
    """Merge two chains based on the specified criteria.

    Parameters:
        right_ds: Chain to join with.
        on: Predicate ("column.name", C("column.name"), or Func) or list of
            Predicates to join on. If both chains have the same predicates then
            this predicate is enough for the join. Otherwise, `right_on` parameter
            has to specify the predicates for the other chain.
        right_on: Optional predicate or list of Predicates for the `right_ds`
            to join.
        inner (bool): Whether to run inner join or outer join.
        full (bool): Whether to run full outer join.
        rname (str): Name prefix for conflicting signal names.

    Returns:
        A new chain holding the joined query and the merged signal schema
        (sys signals from both sides are excluded from that schema).

    Raises:
        DatasetMergeError: If `on` is None, if `on` / `right_on` do not validate,
            if they differ in length, or if a join column cannot be resolved.

    Examples:
        ```py
        meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
                              right_on=(C.name, C.pq__index))
        ```

        ```py
        imgs.merge(captions,
                   on=func.path.file_stem(imgs.c("file.path")),
                   right_on=func.path.file_stem(captions.c("file.path")))
        ```
    """
    if on is None:
        raise DatasetMergeError(["None"], None, "'on' must be specified")

    # Validation rebinds `on`, so capture the caller's type first; otherwise the
    # error message would describe the validator's return value instead of the
    # argument that was actually passed in.
    on_type = type(on)
    on = _validate_merge_on(on, self)
    if not on:
        raise DatasetMergeError(
            on,
            right_on,
            (
                "'on' must be 'str', 'Func' or 'Sequence' object "
                f"but got type '{on_type}'"
            ),
        )

    if right_on is not None:
        right_on_type = type(right_on)  # same reasoning as `on_type` above
        right_on = _validate_merge_on(right_on, right_ds)
        if not right_on:
            raise DatasetMergeError(
                on,
                right_on,
                "'right_on' must be 'str', 'Func' or 'Sequence' object"
                f" but got type '{right_on_type}'",
            )

        if len(right_on) != len(on):
            raise DatasetMergeError(
                on, right_on, "'on' and 'right_on' must have the same length"
            )

    # Joining a chain with itself: work on a clone of the right side.
    if self == right_ds:
        right_ds = right_ds.clone()

    errors = []

    def _resolve(
        ds: DataChain,
        col: str | Function | ColumnExpr,
        side: str | None,
    ):
        """Turn one join predicate into a column expression bound to `ds`.

        A ValueError raised during resolution is recorded in `errors` only
        when `side` is given.
        """
        try:
            if isinstance(col, Function):
                return ds.c(col.get_column())
            return ds.c(col) if isinstance(col, (str, C)) else col
        except ValueError:
            if side:
                errors.append(f"{_get_merge_error_str(col)} in {side}")
            # NOTE(review): falls through and returns None; with `side=None` the
            # failure is not recorded at all - confirm this is intended.

    # When `right_on` is omitted, `on` is reused for the right side and
    # right-side resolution failures are not reported (side is None).
    ops = [
        _resolve(self, left, "left")
        == _resolve(right_ds, right, "right" if right_on else None)
        for left, right in zip(on, right_on or on, strict=False)
    ]

    if errors:
        raise DatasetMergeError(
            on, right_on, f"Could not resolve {', '.join(errors)}"
        )

    query = self._query.join(
        right_ds._query, sqlalchemy.and_(*ops), inner, full, rname
    )
    query.feature_schema = None
    ds = self._evolve(query=query)

    # Note: merge drops sys signals from both sides, make sure to not include it
    # in the resulting schema
    signals_schema = self.signals_schema.clone_without_sys_signals()
    right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()

    ds.signals_schema = signals_schema.merge(
        right_signals_schema,
        rname,
        left_nullable=full,
        right_nullable=not inner,
    )

    return ds

min

min(col: str) -> StandardType

Compute the minimum of a column.

Parameters:

col (str) –

The column to compute the minimum for.

Returns:

StandardType –

The minimum value in the column.

Example

min_size = chain.min("file.size")
print(f"Minimum size: {min_size}")

Source code in datachain/lib/dc/datachain.py

def min(self, col: str) -> StandardType:  # type: ignore[override]
    """Return the smallest value found in a column.

    Parameters:
        col: The column to compute the minimum for.

    Returns:
        The minimum value in the column.

    Example:
        ```py
        min_size = chain.min("file.size")
        print(f"Minimum size: {min_size}")
        ```
    """
    aggregate_name = "min"
    return self._extend_to_data_model(aggregate_name, col)

mutate

mutate(*args, **kwargs) -> Self

Create or modify signals based on existing signals.

This method is vectorized and more efficient compared to map(), and it does not extract or download any data from the internal database. However, it can only utilize predefined built-in functions and their combinations.

Supported functions

- Numerical: +, -, *, /, rand(), avg(), count(), func(), greatest(), least(), max(), min(), sum()
- String: length(), split(), replace(), regexp_replace()
- Filename: name(), parent(), file_stem(), file_ext()
- Array: length(), sip_hash_64(), euclidean_distance(), cosine_distance()
- Window: row_number(), rank(), dense_rank(), first()

Example:

 dc.mutate(
    area=Column("image.height") * Column("image.width"),
    extension=file_ext(Column("file.path")),
    dist=cosine_distance(embedding_text, embedding_image)
)

Window function example:

window = func.window(partition_by="file.parent", order_by="file.size")
dc.mutate(
    row_number=func.row_number().over(window),
)

This method can also be used to rename signals. If Column("name") is provided as the value for the new signal, the old signal is dropped; otherwise a new signal is created. The exception is a nested old signal (e.g. C("file.path")): it is kept so that the parent object stays intact.

Example:

 dc.mutate(
    newkey=Column("oldkey") # drops oldkey
)

 dc.mutate(
    size=Column("file.size") # keeps `file.size`
)

Source code in datachain/lib/dc/datachain.py

def mutate(self, *args, **kwargs) -> "Self":
    """Create or modify signals based on existing signals.

    This method is vectorized and more efficient compared to map(), and it does not
    extract or download any data from the internal database. However, it can only
    utilize predefined built-in functions and their combinations.

    Only keyword arguments (`new_signal=expression`) are accepted; passing a
    positional argument raises an error.

    Supported functions:
       Numerical:   +, -, *, /, rand(), avg(), count(), func(),
                    greatest(), least(), max(), min(), sum()
       String:      length(), split(), replace(), regexp_replace()
       Filename:    name(), parent(), file_stem(), file_ext()
       Array:       length(), sip_hash_64(), euclidean_distance(),
                    cosine_distance()
       Window:      row_number(), rank(), dense_rank(), first()

    Example:
    ```py
    dc.mutate(
        area=Column("image.height") * Column("image.width"),
        extension=file_ext(Column("file.path")),
        dist=cosine_distance(embedding_text, embedding_image)
    )
    ```

    Window function example:
    ```py
    window = func.window(partition_by="file.parent", order_by="file.size")
    dc.mutate(
        row_number=func.row_number().over(window),
    )
    ```

    This method can also be used to rename signals. If `Column("name")` is
    provided as the value for the new signal, the old signal is dropped;
    otherwise a new signal is created. The exception is a nested old signal
    (e.g. `C("file.path")`): it is kept so that the parent object stays intact.

    Example:
    ```py
    dc.mutate(
        newkey=Column("oldkey") # drops oldkey
    )
    ```

    ```py
    dc.mutate(
        size=Column("file.size") # keeps `file.size`
    )
    ```
    """
    method_label = "mutate()"
    if not args:
        return self._mutate(method_label, **kwargs)
    # Positional expressions carry no signal name, so report the first one.
    raise self._named_expression_error(method_label, args[0])

offset

offset(offset: int) -> Self

Return the results starting with the offset row.

If the chain is unordered, which rows are skipped is undefined. If the chain has fewer than offset rows, the result is an empty chain.

Parameters:

offset (int) –

Number of rows to skip.

Source code in datachain/lib/dc/datachain.py

def offset(self, offset: int) -> "Self":
    """Skip the first `offset` rows and return the rest of the chain.

    If the chain is unordered, which rows are skipped is undefined.
    If the chain has fewer than `offset` rows, the result is an empty chain.

    Parameters:
        offset (int): Number of rows to skip.
    """
    shifted_query = self._query.offset(offset)
    return self._evolve(query=shifted_query)

order_by

order_by(*args, descending: bool = False) -> Self

Orders by specified set of columns.

Parameters:

descending (bool, default: False ) –

Whether to sort in descending order or not.

Example

dc.order_by("similarity_score", descending=True).limit(10)

Note

Order is not guaranteed when steps are added after an order_by statement. I.e. when using read_dataset an order_by statement should be used if the order of the records in the chain is important. Using order_by directly before limit, to_list and similar methods will give expected results. See https://github.com/datachain-ai/datachain/issues/477 for further details.

Source code in datachain/lib/dc/datachain.py

@resolve_columns
def order_by(self, *args, descending: bool = False) -> "Self":
    """Sort the chain by the given columns.

    Parameters:
        *args: Columns to sort by. Strings and `Column` objects are first looked
            up through the signal schema; anything else is handed to the query
            as is.
        descending (bool): Whether to sort in descending order or not. The flag
            applies to every column passed.

    Example:
        ```py
        dc.order_by("similarity_score", descending=True).limit(10)
        ```

    Note:
        Order is not guaranteed when steps are added after an `order_by` statement.
        I.e. when using `read_dataset` an `order_by` statement should be used if
        the order of the records in the chain is important.
        Using `order_by` directly before `limit`, `to_list` and similar methods
        will give expected results.
        See https://github.com/datachain-ai/datachain/issues/477
        for further details.
    """
    ordering: list[Any] = []
    for item in args:
        # Only plain names and Column objects have a name to look up.
        if isinstance(item, str):
            lookup_name = item
        elif isinstance(item, Column):
            lookup_name = item.name
        else:
            lookup_name = None

        schema_expr = None
        if lookup_name is not None:
            schema_expr = self.signals_schema.order_by_column(
                lookup_name, descending=descending
            )

        if schema_expr is not None:
            ordering.append(schema_expr)
            continue

        # No schema-provided expression: fall back to the raw argument.
        ordering.append(sqlalchemy.desc(item) if descending else item)

    return self._evolve(query=self._query.order_by(*ordering))

parse_tabular

parse_tabular(
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows: int | None = None,
    **kwargs: Any
) -> Self

Generate chain from list of tabular files.

Parameters:

output (OutputType, default: None ) –

Dictionary or feature class defining column names and their corresponding types. List of column names is also accepted, in which case types will be inferred.
column (str, default: '' ) –

Generated column name.
model_name (str, default: '' ) –

Generated model name.
source (bool, default: True ) –

Whether to include info about the source file.
nrows (int | None, default: None ) –

Optional row limit.
kwargs (Any, default: {} ) –

Parameters to pass to pyarrow.dataset.dataset.

Example

Reading a json lines file:

import datachain as dc
chain = dc.read_storage("s3://mybucket/file.jsonl")
chain = chain.parse_tabular(format="json")

Reading a filtered list of files as a dataset:

import datachain as dc

chain = dc.read_storage("s3://mybucket")
chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
chain = chain.parse_tabular(format="json")

Source code in datachain/lib/dc/datachain.py

def parse_tabular(
    self,
    output: OutputType = None,
    column: str = "",
    model_name: str = "",
    source: bool = True,
    nrows: int | None = None,
    **kwargs: Any,
) -> "Self":
    """Generate chain from list of tabular files.

    Parameters:
        output: Dictionary or feature class defining column names and their
            corresponding types. List of column names is also accepted, in which
            case types will be inferred.
        column: Generated column name.
        model_name: Generated model name.
        source: Whether to include info about the source file.
        nrows: Optional row limit.
        kwargs: Parameters to pass to pyarrow.dataset.dataset.

    Raises:
        DatasetPrepareError: If `nrows` is given for a format other than csv or
            json, if the chain has no `file` signal or no rows, or if schema
            inference fails with a ValueError.

    Example:
        Reading a json lines file:
        ```py
        import datachain as dc
        chain = dc.read_storage("s3://mybucket/file.jsonl")
        chain = chain.parse_tabular(format="json")
        ```

        Reading a filtered list of files as a dataset:
        ```py
        import datachain as dc

        chain = dc.read_storage("s3://mybucket")
        chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
        chain = chain.parse_tabular(format="json")
        ```
    """
    from pyarrow.dataset import CsvFileFormat, JsonFileFormat

    from datachain.lib.arrow import (
        ArrowGenerator,
        fix_pyarrow_format,
        infer_schema,
        schema_to_output,
    )

    # `parse_options` is removed from kwargs because it is passed explicitly
    # (next to **kwargs) to the helpers below.
    parse_options = kwargs.pop("parse_options", None)
    # Only kwargs["format"] is replaced; the local `format` keeps the value the
    # caller supplied, which is what the `nrows` check below inspects.
    if format := kwargs.get("format"):
        kwargs["format"] = fix_pyarrow_format(format, parse_options)

    if (
        nrows
        and format not in ["csv", "json"]
        and not isinstance(format, (CsvFileFormat, JsonFileFormat))
    ):
        raise DatasetPrepareError(
            self.name,
            "error in `parse_tabular` - "
            "`nrows` only supported for csv and json formats.",
        )

    # NOTE(review): `self.count()` presumably executes the chain to count rows -
    # confirm the cost is acceptable here.
    if "file" not in self.schema or not self.count():
        raise DatasetPrepareError(self.name, "no files to parse.")

    schema = None
    # A sequence `output` is treated as a list of column names whose types are
    # inferred; with no `output` at all, both names and types are inferred.
    col_names = output if isinstance(output, Sequence) else None
    if col_names or not output:
        try:
            schema = infer_schema(self, **kwargs, parse_options=parse_options)
            output, _ = schema_to_output(schema, col_names)
        except ValueError as e:
            raise DatasetPrepareError(self.name, e) from e

    # A dict `output` is turned into a data model; otherwise `output` itself is
    # used as the model.
    if isinstance(output, dict):
        model_name = model_name or column or ""
        model = dict_to_data_model(model_name, output)
        output = model
    else:
        model = output  # type: ignore[assignment]

    # With `column` set, the whole model goes under that single signal name;
    # otherwise a BaseModel class is expanded into one signal per model field.
    if column:
        output = {column: model}  # type: ignore[dict-item]
    elif isinstance(output, type(BaseModel)):
        output = {
            name: info.annotation  # type: ignore[misc]
            for name, info in output.model_fields.items()
        }

    # The `source` signal is placed first in the output mapping.
    if source:
        output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]

    # disable prefetch if nrows is set
    settings = {"prefetch": 0} if nrows else {}
    return self.settings(**settings).gen(  # type: ignore[arg-type]
        ArrowGenerator(
            schema,
            model,
            source,
            nrows,
            parse_options=parse_options,
            **kwargs,
        ),
        output=output,
    )

persist

persist() -> Self

Saves temporary chain that will be removed after the process ends. Temporary datasets are useful for optimization, for example when we have multiple chains starting with identical sub-chain. We can then persist that common chain and use it to calculate other chains, to avoid re-calculation every time. It returns the chain itself.

Source code in datachain/lib/dc/datachain.py

def persist(self) -> "Self":
    """Save the chain as a temporary dataset that is removed when the process ends.

    Temporary datasets are useful for optimization: when several chains start
    with an identical sub-chain, that common part can be persisted once and
    reused, instead of being re-calculated for every chain.
    It returns the chain itself.
    """
    feature_schema = self.signals_schema.clone_without_sys_signals().serialize()

    metastore = self.session.catalog.metastore
    target_project = metastore.get_project(
        self.project_name,
        self.namespace_name,
        create=True,
    )

    saved_query = self._query.save(
        project=target_project, feature_schema=feature_schema
    )
    # The returned chain's schema is the current schema combined with sys signals.
    return self._evolve(
        query=saved_query,
        signal_schema=self.signals_schema | SignalSchema({"sys": Sys}),
    )

print_schema

print_schema(file: IO | None = None) -> None

Deprecated. Use print(chain.schema).

Source code in datachain/lib/dc/datachain.py

def print_schema(self, file: IO | None = None) -> None:
    """Deprecated. Use ``print(chain.schema)``.

    Emits a ``DeprecationWarning`` and then prints the chain schema to `file`.
    """
    deprecation_notice = (
        "DataChain.print_schema() is deprecated; use print(chain.schema) instead."
    )
    # stacklevel=2 attributes the warning to the caller of print_schema().
    warnings.warn(deprecation_notice, DeprecationWarning, stacklevel=2)
    print(self.schema, file=file)

reset_settings

reset_settings(settings: Settings | None = None) -> Self

Reset all chain settings to default values, or replace them with the given settings object when one is provided.

Source code in datachain/lib/dc/datachain.py

def reset_settings(self, settings: Settings | None = None) -> "Self":
    """Replace the chain's settings in place and return the chain itself.

    When `settings` is omitted (or falsy), a fresh default `Settings()` is
    installed; otherwise the given settings object is used as is.
    """
    if not settings:
        settings = Settings()
    self._settings = settings
    return self

results

results() -> list[tuple[Any, ...]]

results(
    *,
    row_factory: Callable[[list[str], tuple[Any, ...]], _T]
) -> list[_T]

results(
    *,
    row_factory: Callable[[list[str], tuple[Any, ...]], _T],
    include_hidden: bool
) -> list[_T]

results(*, include_hidden: bool) -> list[tuple[Any, ...]]

results(*, row_factory=None, include_hidden=True)

Return all rows, optionally built via row_factory.

Source code in datachain/lib/dc/datachain.py

def results(self, *, row_factory=None, include_hidden=True):
    """Materialize every row of the chain into a list.

    `row_factory` is forwarded to the row iterator only when one is given;
    `include_hidden` is always forwarded.
    """
    leaf_kwargs = {}
    if row_factory is not None:
        leaf_kwargs["row_factory"] = row_factory
    leaf_kwargs["include_hidden"] = include_hidden
    return list(self._leaf_values(**leaf_kwargs))

sample

sample(n: int) -> Self

Return a random sample from the chain.

Parameters:

n (int) –

Number of samples to draw.

Note

Samples are not deterministic, and streamed/paginated queries or multiple workers will draw samples with replacement.

Source code in datachain/lib/dc/datachain.py

def sample(self, n: int) -> "Self":
    """Draw a random sample of rows from the chain.

    Parameters:
        n: Number of samples to draw.

    Note:
        Samples are not deterministic, and streamed/paginated queries or
        multiple workers will draw samples with replacement.
    """
    sampled_query = self._query.sample(n)
    return self._evolve(query=sampled_query)

save

save(
    name: str,
    version: str | None = None,
    description: str | None = None,
    attrs: list[str] | None = None,
    update_version: str | None = "patch",
    **kwargs
) -> DataChain

Save to a Dataset. It returns the chain itself.

Parameters:

name (str) –

dataset name. This can be either a fully qualified name, including the namespace and project, or just a regular dataset name. In the latter case, the namespace and project will be taken from the settings (if specified) or from the default values otherwise.
version (str | None, default: None ) –

version of a dataset. If version is not specified and dataset already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
description (str | None, default: None ) –

description of a dataset.
attrs (list[str] | None, default: None ) –

attributes of a dataset. They can be without value, e.g "NLP", or with a value, e.g "location=US".
update_version (str | None, default: 'patch' ) –

which part of the dataset version to automatically increase. Available values: major, minor or patch. Default is patch.

Source code in datachain/lib/dc/datachain.py

def save(  # type: ignore[override]
    self,
    name: str,
    version: str | None = None,
    description: str | None = None,
    attrs: list[str] | None = None,
    update_version: str | None = "patch",
    **kwargs,
) -> "DataChain":
    """Save to a Dataset. It returns the chain itself.

    Parameters:
        name: dataset name. This can be either a fully qualified name, including
            the namespace and project, or just a regular dataset name. In the latter
            case, the namespace and project will be taken from the settings
            (if specified) or from the default values otherwise.
        version: version of a dataset. If version is not specified and dataset
            already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
        description: description of a dataset.
        attrs: attributes of a dataset. They can be without value, e.g "NLP",
            or with a value, e.g "location=US".
        update_version: which part of the dataset version to automatically increase.
            Available values: `major`, `minor` or `patch`. Default is `patch`.
        **kwargs: extra options handed to the checkpoint lookup, to the delta
            handling and, when the dataset is actually computed, to the
            underlying query's `save()`.

    Returns:
        The resulting chain: the one obtained from an existing checkpoint, the
        one produced by delta handling, or the newly saved one.

    Raises:
        RuntimeError: If the chain is in ephemeral mode
            (`.settings(ephemeral=True)`).
    """
    if self._settings.ephemeral:
        raise RuntimeError(
            "Cannot save datasets in ephemeral mode. "
            "Remove .settings(ephemeral=True) to save datasets."
        )

    catalog = self.session.catalog

    result = None  # result chain that will be returned at the end

    # Version validation
    self._validate_version(version)
    self._validate_update_version(update_version)

    # `name` is rebound to the bare dataset name; namespace and project come
    # from the name itself or from the chain settings.
    namespace_name, project_name, name = catalog.get_full_dataset_name(
        name,
        namespace_name=self._settings.namespace,
        project_name=self._settings.project,
    )
    project = self._get_or_create_project(namespace_name, project_name)

    # Resolve all listings (including sub-queries in union/join) before hashing
    self._query.resolve_all_listings()

    # Calculate hash including dataset name and job context to avoid conflicts

    base_hash = self._query.hash()
    # The salt is namespace/project/name, plus the version when one is given.
    name_salt = f"{namespace_name}/{project_name}/{name}"
    if version is not None:
        name_salt += f"/{version}"
    _hash = hashlib.sha256((base_hash + name_salt).encode("utf-8")).hexdigest()

    # A truthy result here means the dataset is already available from a checkpoint.
    result = self._resolve_checkpoint(name, project, _hash, kwargs)
    if bool(result):
        print(
            f"Checkpoint found for dataset '{name}', skipping creation",
            file=sys.stderr,
        )

    # Schema preparation
    schema = self.signals_schema.clone_without_sys_signals().serialize()

    # Handle retry and delta functionality
    if not result:
        result = self._handle_delta(name, version, project, schema, kwargs)

    if not result:
        # calculate chain if we already don't have result from checkpoint or delta
        result = self._evolve(
            query=self._query.save(
                name=name,
                version=version,
                project=project,
                description=description,
                attrs=attrs,
                feature_schema=schema,
                update_version=update_version,
                **kwargs,
            )
        )

        # Log checkpoint event for new dataset save
        # (only on this path; checkpoint and delta results do not log it here)
        assert result.version is not None
        full_dataset_name = create_dataset_full_name(
            namespace_name, project_name, name, result.version
        )
        catalog.metastore.log_checkpoint_event(
            job_id=self.job.id,
            event_type=CheckpointEventType.DATASET_SAVE_COMPLETED,
            step_type=CheckpointStepType.DATASET_SAVE,
            run_group_id=self.job.run_group_id,
            dataset_name=full_dataset_name,
            checkpoint_hash=_hash,
        )

    # Record the checkpoint for this job whichever path produced `result`.
    if self._checkpoints_enabled:
        catalog.metastore.get_or_create_checkpoint(self.job.id, _hash)
    return result

select

select(*args: str | Column, **kwargs) -> Self

Select only a specified set of signals.

Nested selections (e.g. "file.path") preserve the parent object by generating partial models rather than flattening into standalone fields.

Example

dc.select("file.path", "score")
dc.select("file.path", file_name=func.path.name(C("file.path")))

Source code in datachain/lib/dc/datachain.py

def select(self, *args: str | Column, **kwargs) -> "Self":
    """Keep only the given signals in the chain.

    Nested selections (e.g. ``"file.path"``) preserve the parent object by
    generating partial models rather than flattening into standalone fields.

    Keyword arguments define named expressions: they are created first and
    then selected together with the positional signals. Calling with no
    arguments returns the chain unchanged.

    Example:
        ```py
        dc.select("file.path", "score")
        dc.select("file.path", file_name=func.path.name(C("file.path")))
        ```
    """
    args = self._signal_names(args, "select()", named_expressions=True)

    if kwargs:
        # Create the named expressions, then select them by name with the rest.
        with_expressions = self._mutate("select()", **kwargs)
        return with_expressions.select(*args, *kwargs)

    if not args:
        return self

    partial_schema = self.signals_schema.to_partial(*args)

    # Carry the sys signal over when the current schema has one and the
    # partial schema does not.
    current_signals = self.signals_schema.values
    if "sys" in current_signals and "sys" not in partial_schema.values:
        sys_only = SignalSchema({"sys": self.signals_schema.values["sys"]})
        partial_schema = sys_only | partial_schema

    db_columns = partial_schema.db_signals()
    selected_query = self._query.select(*db_columns)
    return self._evolve(query=selected_query, signal_schema=partial_schema)

select_except

select_except(*args: str | Column) -> Self

Select all signals except the specified ones.

Supports excluding nested fields (e.g. "file.path"), in which case a partial model is generated for the parent signal.

Example

dc.select_except("tmp_score")

Source code in datachain/lib/dc/datachain.py

def select_except(self, *args: str | Column) -> "Self":
    """Keep every signal except the ones listed.

    Supports excluding nested fields (e.g. ``"file.path"``), in which case a
    partial model is generated for the parent signal. Calling with no
    arguments returns the chain unchanged.

    Example:
        ```py
        dc.select_except("tmp_score")
        ```
    """
    if args:
        excluded = self._signal_names(args, "select_except()")
        remaining_schema = self.signals_schema.select_except_signals(*excluded)
        remaining_columns = remaining_schema.db_signals()
        return self._evolve(
            query=self._query.select(*remaining_columns),
            signal_schema=remaining_schema,
        )
    return self

settings

settings(
    cache: bool | None = None,
    prefetch: bool | int | None = None,
    parallel: bool | int | None = None,
    workers: int | None = None,
    namespace: str | None = None,
    project: str | None = None,
    min_task_size: int | None = None,
    batch_size: int | None = None,
    sys: bool | None = None,
    ephemeral: bool | None = None,
) -> Self

Set chain execution parameters. Returns the chain itself, allowing method chaining for subsequent operations. To restore all settings to their default values, use reset_settings().

Parameters:

cache (bool | None, default: None ) –

Enable files caching to speed up subsequent accesses to the same files from the same or different chains. Defaults to False.
prefetch (bool | int | None, default: None ) –

Enable prefetching of files. This will download files in advance in parallel. If an integer is provided, it specifies the number of files to prefetch concurrently for each process on each worker. Defaults to 2. Set to 0 or False to disable prefetching.
parallel (bool | int | None, default: None ) –

Number of processes to use for processing user-defined functions (UDFs) in parallel. If an integer is provided, it specifies the number of CPUs to use. If True, all available CPUs are used. Defaults to 1.
namespace (str | None, default: None ) –

Namespace to use for the chain by default.
project (str | None, default: None ) –

Project to use for the chain by default.
min_task_size (int | None, default: None ) –

Minimum number of rows per worker/process for parallel processing by UDFs. Defaults to 1.
batch_size (int | None, default: None ) –

Number of rows per insert by UDF to fine tune and balance speed and memory usage. This might be useful when processing large rows or when running into memory issues. Defaults to 2000.
ephemeral (bool | None, default: None ) –

If True, no persistent objects are created in the metastore (no jobs, checkpoints, or datasets). UDF execution still uses temporary tables. Calling .save() in ephemeral mode will raise an error.

Example

chain = (
    chain
    .settings(cache=True, parallel=8, batch_size=300)
    .map(laion=process_webdataset(spec=WDSLaion), params="file")
)

Source code in datachain/lib/dc/datachain.py

def settings(
    self,
    cache: bool | None = None,
    prefetch: bool | int | None = None,
    parallel: bool | int | None = None,
    workers: int | None = None,
    namespace: str | None = None,
    project: str | None = None,
    min_task_size: int | None = None,
    batch_size: int | None = None,
    sys: bool | None = None,
    ephemeral: bool | None = None,
) -> "Self":
    """Configure how the chain is executed and return the resulting chain, so
    further operations can be chained. Use `reset_settings()` to bring every
    setting back to its default value.

    Parameters:
        cache: Cache files so that later accesses to the same files, from this
            or other chains, are faster. Defaults to False.
        prefetch: Download files ahead of time, in parallel. An integer sets
            how many files are prefetched concurrently for each process on
            each worker. Defaults to 2. Pass 0 or False to turn prefetching
            off.
        parallel: Number of processes used to run user-defined functions
            (UDFs) in parallel. An integer is the number of CPUs to use; True
            uses all available CPUs. Defaults to 1.
        workers: Passed through to the chain settings as-is.
            NOTE(review): presumably the number of distributed workers —
            confirm against `Settings`.
        namespace: Default namespace for the chain.
        project: Default project for the chain.
        min_task_size: Minimum number of rows handed to each worker/process
            when UDFs run in parallel. Defaults to 1.
        batch_size: Number of rows per insert by UDF; tune it to balance speed
            against memory usage, e.g. for large rows or when running into
            memory issues. Defaults to 2000.
        sys: Value handed on as the chain's `_sys` flag. When None, the
            chain's current `_sys` value is kept.
        ephemeral: If True, no persistent objects are created in the metastore
            (no jobs, checkpoints, or datasets). UDF execution still uses
            temporary tables. Calling .save() in ephemeral mode will raise
            an error.

    Example:
        ```py
        chain = (
            chain
            .settings(cache=True, parallel=8, batch_size=300)
            .map(laion=process_webdataset(spec=WDSLaion), params="file")
        )
        ```
    """
    sys_flag = self._sys if sys is None else sys

    # Work on a copy so the settings object held by this chain is not altered.
    merged = copy.copy(self._settings)
    overrides = Settings(
        cache=cache,
        prefetch=prefetch,
        parallel=parallel,
        workers=workers,
        namespace=namespace,
        project=project,
        min_task_size=min_task_size,
        batch_size=batch_size,
        ephemeral=ephemeral,
    )
    merged.add(overrides)
    return self._evolve(settings=merged, _sys=sys_flag)

setup

setup(**kwargs) -> Self

Setup variables to pass to UDF functions.

Use before running map/gen/agg to save an object and pass it as an argument to the UDF.

The value must be a callable (a lambda: <value> syntax can be used to quickly create one) that returns the object to be passed to the UDF. It is evaluated lazily when the UDF is running; in case of multiple machines, the callable is run on a worker machine.

Example

import anthropic
from anthropic.types import Message
import datachain as dc

(
    dc.read_storage(DATA, type="text")
    .settings(parallel=4, cache=True)

    # Setup Anthropic client and pass it to the UDF below automatically
    # The value is callable (see the note above)
    .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))

    .map(
        claude=lambda client, file: client.messages.create(
            model=MODEL,
            system=PROMPT,
            messages=[{"role": "user", "content": file.get_value()}],
        ),
        output=Message,
    )
)

Source code in datachain/lib/dc/datachain.py

def setup(self, **kwargs) -> "Self":
    """Setup variables to pass to UDF functions.

    Use before running map/gen/agg to save an object and pass it as an
    argument to the UDF.

    The value must be a callable (a `lambda: <value>` syntax can be used to quickly
    create one) that returns the object to be passed to the UDF. It is evaluated
    lazily when UDF is running, in case of multiple machines the callable is run on
    a worker machine.

    Example:
        ```py
        import anthropic
        from anthropic.types import Message
        import datachain as dc

        (
            dc.read_storage(DATA, type="text")
            .settings(parallel=4, cache=True)

            # Setup Anthropic client and pass it to the UDF below automatically
            # The value is callable (see the note above)
            .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))

            .map(
                claude=lambda client, file: client.messages.create(
                    model=MODEL,
                    system=PROMPT,
                    messages=[{"role": "user", "content": file.get_value()}],
                ),
                output=Message,
            )
        )
        ```
    """
    intersection = set(self._setup.keys()) & set(kwargs.keys())
    if intersection:
        keys = ", ".join(intersection)
        raise DatasetPrepareError(self.name, f"this value(s) already setup: {keys}")

    self._setup = self._setup | kwargs
    return self

show

show(
    limit: int = 20,
    flatten: bool = False,
    transpose: bool = False,
    truncate: bool = True,
    include_hidden: bool = False,
) -> None

Show a preview of the chain results.

Parameters:

limit (int, default: 20 ) –

How many rows to show.
flatten (bool, default: False ) –

Whether to use a multiindex or flatten column names.
transpose (bool, default: False ) –

Whether to transpose rows and columns.
truncate (bool, default: True ) –

Whether or not to truncate the contents of columns.
include_hidden (bool, default: False ) –

Whether to include hidden columns.

Source code in datachain/lib/dc/datachain.py

def show(
    self,
    limit: int = 20,
    flatten: bool = False,
    transpose: bool = False,
    truncate: bool = True,
    include_hidden: bool = False,
) -> None:
    """Show a preview of the chain results.

    The preview is printed, or rendered with IPython's `display` when running
    inside a notebook. An empty result prints only the column names.

    Parameters:
        limit: How many rows to show. A value of 0 or less shows all rows.
        flatten: Whether to use a multiindex or flatten column names.
        transpose: Whether to transpose rows and columns.
        truncate: Whether or not to truncate the contents of columns.
        include_hidden: Whether to include hidden columns.
    """
    import pandas as pd

    dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
    df = dc.to_pandas(
        flatten,
        include_hidden=include_hidden,
        as_object=True,
    )

    if df.empty:
        print("Empty result")
        print(f"Columns: {list(df.columns)}")
        return

    # Count rows before transposing: once transposed, len(df) is the number
    # of columns and the "Limited by" footer would report the wrong figure.
    row_count = len(df)

    if transpose:
        df = df.T

    options: list = [
        "display.max_columns",
        None,
        "display.multi_sparse",
        False,
    ]

    try:
        if columns := os.get_terminal_size().columns:
            options.extend(["display.width", columns])
    except OSError:
        # No terminal attached (e.g. output is piped); keep pandas' default width.
        pass

    if not truncate:
        options.extend(["display.max_colwidth", None])

    with pd.option_context(*options):
        if inside_notebook():
            from IPython.display import display

            display(df)
        else:
            print(df)

    if row_count == limit:
        print(f"\n[Limited by {row_count} rows]")

shuffle

shuffle() -> Self

Shuffle rows with a best-effort deterministic ordering.

This produces repeatable shuffles. Merge and union operations can lead to non-deterministic results. Use order by or save a dataset afterward to guarantee the same result.

Source code in datachain/lib/dc/datachain.py

def shuffle(self) -> "Self":
    """Return the chain with its rows shuffled in a best-effort deterministic
    order.

    Shuffles are repeatable. Merge and union operations can lead to
    non-deterministic results; use order by or save a dataset afterward to
    guarantee the same result.
    """
    source_query = self._query
    shuffled_query = source_query.clone(new_table=False)
    # Add a step that regenerates the system columns, then sort on the
    # random system column below.
    shuffled_query.steps.append(RegenerateSystemColumns(source_query.catalog))

    # Make sure the "sys" signal is part of the schema so it can be ordered on.
    schema_with_sys = SignalSchema({"sys": Sys}) | self.signals_schema
    return self._evolve(
        query=shuffled_query,
        signal_schema=schema_with_sys,
    ).order_by("sys.rand")

similarity_search

similarity_search(
    column: str,
    query: Sequence[float],
    *,
    k: int | None = 10,
    metric: str = "cosine",
    score_column: str | None = None
) -> Self

Return rows whose column embedding is closest to query.

Shortcut for .mutate(...).order_by(...).limit(k).

Parameters:

column (str) –

name of the embedding column on each row.
query (Sequence[float]) –

reference embedding (the vector to compare against).
k (int | None, default: 10 ) –

how many closest rows to return. None skips the limit and annotates/sorts every row.
metric (str, default: 'cosine' ) –

"cosine", "euclidean" or "l2" ("l2" is an alias for "euclidean").
score_column (str | None, default: None ) –

name to store the distance under. If None (default) the score is not included in the result.

Example

query = [0.1, 0.2, 0.3]
top5 = chain.similarity_search("emb", query, k=5)

with_score = chain.similarity_search(
    "emb", query, k=5, score_column="dist"
)

Source code in datachain/lib/dc/datachain.py

def similarity_search(
    self,
    column: str,
    query: Sequence[float],
    *,
    k: int | None = 10,
    metric: str = "cosine",
    score_column: str | None = None,
) -> "Self":
    """Return the rows whose ``column`` embedding lies closest to ``query``.

    Equivalent to ``.mutate(...).order_by(...).limit(k)``.

    Parameters:
        column: name of the embedding column on each row.
        query: reference embedding (the vector to compare against).
        k: number of closest rows to return. With ``None`` no limit is
            applied and every row is annotated/sorted.
        metric: ``"cosine"``, ``"euclidean"`` or ``"l2"``
            (``"l2"`` is an alias for ``"euclidean"``).
        score_column: name under which the distance is stored. With ``None``
            (default) the score is left out of the result.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.

    Example:
        ```py
        query = [0.1, 0.2, 0.3]
        top5 = chain.similarity_search("emb", query, k=5)

        with_score = chain.similarity_search(
            "emb", query, k=5, score_column="dist"
        )
        ```
    """
    distance_by_metric = {
        "cosine": cosine_distance,
        "euclidean": euclidean_distance,
        "l2": euclidean_distance,
    }
    distance = distance_by_metric.get(metric)
    if distance is None:
        raise ValueError(
            f"Unsupported metric '{metric}'. Choose one of: "
            f"{sorted(distance_by_metric)}"
        )

    # The score always needs a column to sort on; fall back to the internal
    # name when the caller did not ask for one.
    col_name: str = score_column or SIMILARITY_SCORE_COL_NAME

    scored = self.mutate(**{col_name: distance(column, list(query))})
    ranked = scored.order_by(col_name)
    if k is not None:
        ranked = ranked.limit(k)
    return ranked.select_except(col_name) if score_column is None else ranked

subtract

subtract(
    other: DataChain,
    on: str | Sequence[str] | None = None,
    right_on: str | Sequence[str] | None = None,
) -> Self

Remove rows that appear in another chain.

Parameters:

other (DataChain) –

chain whose rows will be removed from self
on (str | Sequence[str] | None, default: None ) –

columns to consider for determining row equality in self. If unspecified, defaults to all common columns between self and other.
right_on (str | Sequence[str] | None, default: None ) –

columns to consider for determining row equality in other. If unspecified, defaults to the same values as on.

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def subtract(  # type: ignore[override]
    self,
    other: "DataChain",
    on: str | Sequence[str] | None = None,
    right_on: str | Sequence[str] | None = None,
) -> "Self":
    """Remove rows that appear in another chain.

    Parameters:
        other: chain whose rows will be removed from `self`
        on: columns to consider for determining row equality in `self`.
            If unspecified, defaults to all common columns
            between `self` and `other`.
        right_on: columns to consider for determining row equality in `other`.
            If unspecified, defaults to the same values as `on`.

    Raises:
        DataChainParamsError: If `on` or `right_on` is an empty string, an
            empty sequence, or contains an empty value; if the chains share
            no columns when neither is given; if `right_on` is given without
            `on`; or if `on` and `right_on` differ in length.
        TypeError: If both are given and either is neither a `str` nor a
            `Sequence`.
    """
    # Normalise `on`: a single name becomes a one-item list; empty values are
    # rejected up front.
    if isinstance(on, str):
        if not on:
            raise DataChainParamsError("'on' cannot be an empty string")
        on = [on]
    elif isinstance(on, Sequence):
        # An empty sequence is reported with the same message as an empty item.
        if not on or any(not col for col in on):
            raise DataChainParamsError("'on' cannot contain empty strings")

    # Same normalisation for `right_on`.
    if isinstance(right_on, str):
        if not right_on:
            raise DataChainParamsError("'right_on' cannot be an empty string")
        right_on = [right_on]
    elif isinstance(right_on, Sequence):
        if not right_on or any(not col for col in right_on):
            raise DataChainParamsError("'right_on' cannot contain empty strings")

    if on is None and right_on is None:
        # Neither side given: compare on every DB column present in both
        # chains, keeping the column order of `self`.
        other_columns = set(other._effective_signals_schema.db_signals())
        common_signals = [
            c
            for c in self._effective_signals_schema.db_signals()
            if c in other_columns
        ]
        if not common_signals:
            raise DataChainParamsError("subtract(): no common columns")
        signals = list(zip(common_signals, common_signals, strict=False))
    elif on is not None and right_on is None:
        # Only `on` given: the columns are resolved against this chain's
        # schema and the same names are used for both sides.
        # NOTE(review): `right_on` is reassigned here but not read afterwards
        # within this function.
        right_on = on
        resolved_signals = list(self.signals_schema.resolve(*on).db_signals())
        signals = list(zip(resolved_signals, resolved_signals, strict=False))  # type: ignore[arg-type]
    elif on is None and right_on is not None:
        raise DataChainParamsError(
            "'on' must be specified when 'right_on' is provided"
        )
    else:
        # Both given: each side is resolved against its own chain's schema
        # and the resulting DB columns are paired positionally.
        if not isinstance(on, Sequence) or not isinstance(right_on, Sequence):
            raise TypeError(
                "'on' and 'right_on' must be 'str' or 'Sequence' object"
            )
        if len(on) != len(right_on):
            raise DataChainParamsError(
                "'on' and 'right_on' must have the same length"
            )
        signals = list(
            zip(
                self.signals_schema.resolve(*on).db_signals(),
                other.signals_schema.resolve(*right_on).db_signals(),
                strict=False,
            )  # type: ignore[arg-type]
        )
    return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

sum

sum(col: str) -> StandardType

Compute the sum of a column.

Parameters:

col (str) –

The column to compute the sum for.

Returns:

StandardType –

The sum of the column values.

Example

total_size = chain.sum("file.size")
print(f"Total size: {total_size}")

Source code in datachain/lib/dc/datachain.py

def sum(self, col: str) -> StandardType:  # type: ignore[override]
    """Return the sum of the values in a column.

    Parameters:
        col: Name of the column to add up.

    Returns:
        The sum of the column values.

    Example:
        ```py
        total_size = chain.sum("file.size")
        print(f"Total size: {total_size}")
        ```
    """
    aggregate_name = "sum"
    return self._extend_to_data_model(aggregate_name, col)

to_columnar_data_with_names

to_columnar_data_with_names(
    chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
) -> tuple[list[str], Iterator[list[list[Any]]]]

Returns column names and the results as an iterator that provides chunks, with each chunk containing a list of columns, where each column contains a list of the row values for that column in that chunk. Useful for columnar data formats, such as parquet or other OLAP databases.

Source code in datachain/lib/dc/datachain.py

def to_columnar_data_with_names(
    self, chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE
) -> tuple[list[str], Iterator[list[list[Any]]]]:
    """Return the column names together with an iterator over chunks of the
    results in columnar layout.

    Every chunk is a list of columns, and every column is the list of that
    column's row values within the chunk. Useful for columnar data formats,
    such as parquet or other OLAP databases.

    Parameters:
        chunk_size: Batch size passed to `batched_it` when grouping rows into
            chunks.
    """
    headers, _ = self._effective_signals_schema.get_headers_with_length()
    # Each header is a sequence of name parts; empty parts are skipped.
    names = [".".join(part for part in header if part) for header in headers]
    width = len(names)

    rows = self._leaf_values()

    def column_chunks() -> Iterator[list[list[Any]]]:
        for batch in batched_it(rows, chunk_size):
            buckets: list[list[Any]] = [[] for _ in range(width)]
            for row in batch:
                # Pivot the row: value at position i goes to column i.
                for position in range(width):
                    buckets[position].append(row[position])
            yield buckets

    return names, column_chunks()

to_csv

to_csv(
    path: str | PathLike[str],
    delimiter: str = ",",
    fs_kwargs: dict[str, Any] | None = None,
    **kwargs
) -> File

Save chain to a csv (comma-separated values) file and return the stored File.

Parameters:

path (str | PathLike[str]) –

Path to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
delimiter (str, default: ',' ) –

Delimiter to use for the resulting file.
fs_kwargs (dict[str, Any] | None, default: None ) –

Optional kwargs forwarded to the underlying fsspec filesystem when writing (e.g., s3://, gs://, hf://), fsspec-specific options are supported.

Returns:

File ( File ) –

The stored file with refreshed metadata (version, etag, size).

Source code in datachain/lib/dc/datachain.py

def to_csv(
    self,
    path: str | os.PathLike[str],
    delimiter: str = ",",
    fs_kwargs: dict[str, Any] | None = None,
    **kwargs,
) -> File:
    """Write the chain to a csv (comma-separated values) file and return the
    stored `File`.

    The first row written holds the column names; the chain's rows follow.

    Parameters:
        path: Where to save the file. Local paths are supported as well as
            remote paths, such as s3:// or hf:// with fsspec.
        delimiter: Delimiter to use for the resulting file.
        fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
            when writing (e.g., s3://, gs://, hf://), fsspec-specific options
            are supported.
        **kwargs: Extra keyword arguments passed on to `csv.writer`.

    Returns:
        File: The stored file with refreshed metadata (version, etag, size).
    """
    import csv

    destination = File.at(path, session=self.session)

    headers, _ = self._effective_signals_schema.get_headers_with_length()
    # Each header is a sequence of name parts; empty parts are skipped.
    header_row = [".".join(part for part in header if part) for header in headers]

    with destination.open("w", newline="", client_config=fs_kwargs) as stream:
        csv_writer = csv.writer(stream, delimiter=delimiter, **kwargs)
        csv_writer.writerow(header_row)
        csv_writer.writerows(self._leaf_values())

    return destination

to_database

to_database(
    table_name: str,
    connection: ConnectionType,
    *,
    batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
    on_conflict: str | None = None,
    conflict_columns: list[str] | None = None,
    column_mapping: dict[str, str | None] | None = None
) -> int

Save chain to a database table using a given database connection.

This method exports all DataChain records to a database table, creating the table if it doesn't exist and appending data if it does. The table schema is automatically inferred from the DataChain's signal schema.

For PostgreSQL, tables are created in the schema specified by the connection's search_path (defaults to 'public'). Use URL parameters to target specific schemas.

Parameters:

table_name (str) –

Name of the database table to create/write to.
connection (ConnectionType) –

SQLAlchemy connectable, str, or a sqlite3 connection. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible for engine disposal and connection closure for the SQLAlchemy connectable; str connections are closed automatically.
batch_size (int, default: DEFAULT_DATABASE_BATCH_SIZE ) –

Number of rows to insert per batch for optimal performance. Larger batches are faster but use more memory. Default: 10,000.
on_conflict (str | None, default: None ) –

Strategy for handling duplicate rows (requires table constraints): - None: Raise error (sqlalchemy.exc.IntegrityError) on conflict (default) - "ignore": Skip duplicate rows silently - "update": Update existing rows with new values
conflict_columns (list[str] | None, default: None ) –

List of column names that form a unique constraint for conflict resolution. Required when on_conflict='update' and using PostgreSQL.
column_mapping (dict[str, str | None] | None, default: None ) –

Optional mapping to rename or skip columns: - Dict mapping DataChain column names to database column names - Set values to None to skip columns entirely, or use defaultdict to skip all columns except those specified.

Returns:

int ( int ) –

Number of rows affected (inserted/updated). -1 if DB driver doesn't support telemetry.

Examples:

Basic usage with PostgreSQL:

import datachain as dc

rows_affected = (dc
  .read_storage("s3://my-bucket/")
  .to_database("files_table", "postgresql://user:pass@localhost/mydb")
)
print(f"Inserted/updated {rows_affected} rows")

Using SQLite with connection string:

rows_affected = chain.to_database("my_table", "sqlite:///data.db")
print(f"Affected {rows_affected} rows")

Column mapping and renaming:

mapping = {
    "user.id": "id",
    "user.name": "name",
    "user.password": None  # Skip this column
}
chain.to_database("users", engine, column_mapping=mapping)

Handling conflicts (requires PRIMARY KEY or UNIQUE constraints):

# Skip duplicates
chain.to_database("my_table", engine, on_conflict="ignore")

# Update existing records
chain.to_database(
   "my_table", engine, on_conflict="update", conflict_columns=["id"]
)

Working with different databases:

# MySQL
mysql_engine = sa.create_engine("mysql+pymysql://user:pass@host/db")
chain.to_database("mysql_table", mysql_engine)

# SQLite in-memory
chain.to_database("temp_table", "sqlite:///:memory:")

PostgreSQL with schema support:

pg_url = "postgresql://user:pass@host/db?options=-c search_path=analytics"
chain.to_database("processed_data", pg_url)

Source code in datachain/lib/dc/datachain.py

def to_database(
    self,
    table_name: str,
    connection: "ConnectionType",
    *,
    batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
    on_conflict: str | None = None,
    conflict_columns: list[str] | None = None,
    column_mapping: dict[str, str | None] | None = None,
) -> int:
    """Export the chain to a database table over the given connection.

    All DataChain records are written to the table. The table is created when
    it doesn't exist, and data is appended when it does. The table schema is
    inferred automatically from the DataChain's signal schema.

    For PostgreSQL, tables are created in the schema specified by the
    connection's search_path (defaults to 'public'). Use URL parameters to
    target specific schemas.

    Parameters:
        table_name: Name of the database table to create/write to.
        connection: SQLAlchemy connectable, str, or a sqlite3 connection.
            Using SQLAlchemy makes it possible to use any DB supported by that
            library. If a DBAPI2 object, only sqlite3 is supported. The user is
            responsible for engine disposal and connection closure for the
            SQLAlchemy connectable; str connections are closed automatically.
        batch_size: Number of rows to insert per batch for optimal performance.
            Larger batches are faster but use more memory. Default: 10,000.
        on_conflict: Strategy for handling duplicate rows (requires table
            constraints):
            - None: Raise error (`sqlalchemy.exc.IntegrityError`) on conflict
              (default)
            - "ignore": Skip duplicate rows silently
            - "update": Update existing rows with new values
        conflict_columns: List of column names that form a unique constraint
            for conflict resolution. Required when on_conflict='update' and
            using PostgreSQL.
        column_mapping: Optional mapping to rename or skip columns:
            - Dict mapping DataChain column names to database column names
            - Set values to None to skip columns entirely, or use `defaultdict` to
              skip all columns except those specified.

    Returns:
        int: Number of rows affected (inserted/updated). -1 if DB driver doesn't
             support telemetry.

    Examples:
        Basic usage with PostgreSQL:
        ```py
        import datachain as dc

        rows_affected = (dc
          .read_storage("s3://my-bucket/")
          .to_database("files_table", "postgresql://user:pass@localhost/mydb")
        )
        print(f"Inserted/updated {rows_affected} rows")
        ```

        Using SQLite with connection string:
        ```py
        rows_affected = chain.to_database("my_table", "sqlite:///data.db")
        print(f"Affected {rows_affected} rows")
        ```

        Column mapping and renaming:
        ```py
        mapping = {
            "user.id": "id",
            "user.name": "name",
            "user.password": None  # Skip this column
        }
        chain.to_database("users", engine, column_mapping=mapping)
        ```

        Handling conflicts (requires PRIMARY KEY or UNIQUE constraints):
        ```py
        # Skip duplicates
        chain.to_database("my_table", engine, on_conflict="ignore")

        # Update existing records
        chain.to_database(
           "my_table", engine, on_conflict="update", conflict_columns=["id"]
        )
        ```

        Working with different databases:
        ```py
        # MySQL
        mysql_engine = sa.create_engine("mysql+pymysql://user:pass@host/db")
        chain.to_database("mysql_table", mysql_engine)

        # SQLite in-memory
        chain.to_database("temp_table", "sqlite:///:memory:")
        ```

        PostgreSQL with schema support:
        ```py
        pg_url = "postgresql://user:pass@host/db?options=-c search_path=analytics"
        chain.to_database("processed_data", pg_url)
        ```
    """
    # Imported here and aliased so it does not clash with this method's name.
    from .database import to_database as export_to_database

    export_options = {
        "batch_size": batch_size,
        "on_conflict": on_conflict,
        "conflict_columns": conflict_columns,
        "column_mapping": column_mapping,
    }
    return export_to_database(self, table_name, connection, **export_options)

to_iter

to_iter(
    *cols: str | Column,
) -> Generator[tuple[DataValue, ...], None, None]

Yields rows of values, optionally limited to the specified columns.

Parameters:

*cols (str | Column, default: () ) –

Limit to the specified columns. String names and plain C("...") column references are supported. By default, all columns are selected.

Yields:

tuple[DataValue, ...] –

Yields a tuple of items for each row.

Example

Iterating over all rows:

for row in ds.to_iter():
    print(row)

DataChain is iterable and can be used in a for loop directly which is equivalent to ds.to_iter():

for row in ds:
    print(row)

Iterating over all rows with selected columns:

for name, size in ds.to_iter("file.path", "file.size"):
    print(name, size)

Iterating over a single column:

for (file,) in ds.to_iter("file.path"):
    print(file)

Source code in datachain/lib/dc/datachain.py

def to_iter(
    self, *cols: str | Column
) -> Generator[tuple[DataValue, ...], None, None]:
    """Yields rows of values, optionally limited to the specified columns.

    Parameters:
        *cols: Limit to the specified columns. String names and plain
            ``C("...")`` column references are supported. By default, all
            columns are selected.

    Yields:
        (tuple[DataValue, ...]): Yields a tuple of items for each row.

    Example:
        Iterating over all rows:
        ```py
        for row in ds.to_iter():
            print(row)
        ```

        DataChain is iterable and can be used in a for loop directly which is
        equivalent to `ds.to_iter()`:
        ```py
        for row in ds:
            print(row)
        ```

        Iterating over all rows with selected columns:
        ```py
        for name, size in ds.to_iter("file.path", "file.size"):
            print(name, size)
        ```

        Iterating over a single column:
        ```py
        for (file,) in ds.to_iter("file.path"):
            print(file)
        ```
    """
    cols = self._signal_names(cols, "to_iter()")
    # With explicit columns, narrow the schema to them; otherwise use the
    # chain's effective schema.
    signals_schema = (
        self.signals_schema.resolve(*cols)
        if cols
        else self._effective_signals_schema
    )
    db_signals = signals_schema.db_signals()
    # The context manager keeps the row iterable open only while rows are
    # being consumed.
    with self._query.ordered_select(*db_signals).as_iterable() as rows:
        for row in rows:
            # Convert the raw DB row back into feature values per the schema.
            ret = signals_schema.row_to_features(
                row, catalog=self.session.catalog, cache=self._settings.cache
            )
            yield tuple(ret)

to_json

to_json(
    path: str | PathLike[str],
    fs_kwargs: dict[str, Any] | None = None,
    include_outer_list: bool = True,
) -> File

Save chain to a JSON file and return the stored File.

Parameters:

path (str | PathLike[str]) –

Path to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
fs_kwargs (dict[str, Any] | None, default: None ) –

Optional kwargs forwarded to the underlying fsspec filesystem when writing (e.g., s3://, gs://, hf://), fsspec-specific options are supported.
include_outer_list (bool, default: True ) –

Sets whether to include an outer list for all rows. Setting this to True makes the file valid JSON, while False instead writes in the JSON lines format.

Returns:

File ( File ) –

The stored file with refreshed metadata (version, etag, size).

Source code in datachain/lib/dc/datachain.py

def to_json(
    self,
    path: str | os.PathLike[str],
    fs_kwargs: dict[str, Any] | None = None,
    include_outer_list: bool = True,
) -> File:
    """Write the chain to a JSON file and return the stored `File`.

    Parameters:
        path: Where to save the file. Local paths are supported as well as
            remote paths, such as s3:// or hf:// with fsspec.
        fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
            when writing (e.g., s3://, gs://, hf://), fsspec-specific options
            are supported.
        include_outer_list: Whether all rows are wrapped in an outer list.
            True makes the file valid JSON, while False instead writes in the
            JSON lines format.

    Returns:
        File: The stored file with refreshed metadata (version, etag, size).
    """
    destination = File.at(path, session=self.session)
    # Opened in binary mode; the stream writer receives the raw file object.
    with destination.open("wb", client_config=fs_kwargs) as stream:
        self._write_json_stream(stream, include_outer_list)
    return destination

to_jsonl

to_jsonl(
    path: str | PathLike[str],
    fs_kwargs: dict[str, Any] | None = None,
) -> File

Save chain to a JSON lines file.

Parameters:

path (str | PathLike[str]) –

Path to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
fs_kwargs (dict[str, Any] | None, default: None ) –

Optional kwargs forwarded to the underlying fsspec filesystem when writing (e.g., s3://, gs://, hf://), fsspec-specific options are supported.

Returns:

File ( File ) –

The stored file with refreshed metadata (version, etag, size).

Source code in datachain/lib/dc/datachain.py

def to_jsonl(
    self,
    path: str | os.PathLike[str],
    fs_kwargs: dict[str, Any] | None = None,
) -> File:
    """Write the chain to `path` in the JSON lines format.

    This is `to_json` called with `include_outer_list=False`.

    Parameters:
        path: Destination of the file. Local paths are accepted, as are
            remote ones (for example s3:// or hf://) handled through fsspec.
        fs_kwargs: Optional fsspec-specific options passed on to the
            underlying filesystem when writing (e.g., s3://, gs://, hf://).

    Returns:
        File: The stored file with refreshed metadata (version, etag, size).
    """
    return self.to_json(
        path,
        fs_kwargs=fs_kwargs,
        include_outer_list=False,
    )

to_list

to_list(*cols: str | Column) -> list[tuple[DataValue, ...]]

Returns a list of rows of values, optionally limited to the specified columns.

Parameters:

*cols (str | Column, default: () ) –

Limit to the specified columns. String names and plain C("...") column references are supported. By default, all columns are selected.

Returns:

list[tuple[DataValue, ...]] –

list[tuple[DataValue, ...]]: Returns a list of tuples of items for each row.

Example

Getting all rows as a list:

rows = dc.to_list()
print(rows)

Getting all rows with selected columns as a list:

name_size_pairs = dc.to_list("file.path", "file.size")
print(name_size_pairs)

Getting a single column as a list:

files = dc.to_list("file.path")
print(files)  # Returns list of 1-tuples

Source code in datachain/lib/dc/datachain.py

def to_list(self, *cols: str | Column) -> list[tuple[DataValue, ...]]:
    """Collect the rows of the chain into a list, one tuple per row.

    Parameters:
        *cols: Columns to restrict the output to. String names and plain
            ``C("...")`` column references are supported. When omitted, all
            columns are selected.

    Returns:
        list[tuple[DataValue, ...]]: One tuple of values for each row.

    Example:
        Getting all rows as a list:
        ```py
        rows = dc.to_list()
        print(rows)
        ```

        Getting all rows with selected columns as a list:
        ```py
        name_size_pairs = dc.to_list("file.path", "file.size")
        print(name_size_pairs)
        ```

        Getting a single column as a list:
        ```py
        files = dc.to_list("file.path")
        print(files)  # Returns list of 1-tuples
        ```
    """
    # Resolve the requested columns to signal names first; the second argument
    # is the caller label handed to the resolver.
    names = self._signal_names(cols, "to_list()")
    return [*self.to_iter(*names)]

to_pandas

to_pandas(
    flatten: bool = False,
    include_hidden: bool = True,
    as_object: bool = False,
) -> DataFrame

Return a pandas DataFrame from the chain.

Parameters:

flatten (bool, default: False ) –

Whether to use a multiindex or flatten column names.
include_hidden (bool, default: True ) –

Whether to include hidden columns.
as_object (bool, default: False ) –

Whether to emit a dataframe backed by Python objects rather than pandas-inferred dtypes.

Returns:

DataFrame –

pd.DataFrame: A pandas DataFrame representation of the chain.

Source code in datachain/lib/dc/datachain.py

def to_pandas(
    self,
    flatten: bool = False,
    include_hidden: bool = True,
    as_object: bool = False,
) -> "pd.DataFrame":
    """Build a pandas DataFrame holding the rows of the chain.

    Parameters:
        flatten: Whether to use a multiindex or flatten column names.
        include_hidden: Whether to include hidden columns.
        as_object: Whether to emit a dataframe backed by Python objects
            rather than pandas-inferred dtypes.

    Returns:
        pd.DataFrame: A pandas DataFrame representation of the chain.
    """
    import pandas as pd

    schema = self._effective_signals_schema
    headers, depth = schema.get_headers_with_length(include_hidden=include_hidden)

    columns: list[str] | pd.MultiIndex
    if not flatten and depth >= 2:
        # Nested headers are kept as a column MultiIndex.
        columns = pd.MultiIndex.from_tuples([tuple(header) for header in headers])
    else:
        # Dotted names; empty header parts are skipped.
        columns = [".".join(part for part in header if part) for header in headers]

    rows = self.results(include_hidden=include_hidden)
    if not as_object:
        return pd.DataFrame.from_records(rows, columns=columns)

    frame = pd.DataFrame(rows, columns=columns, dtype=object)
    # Replace every missing value with None, in place.
    frame.where(pd.notna(frame), None, inplace=True)
    return frame

to_parquet

to_parquet(
    path: str | PathLike[str] | BinaryIO,
    partition_cols: Sequence[str] | None = None,
    chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
    fs_kwargs: dict[str, Any] | None = None,
    **kwargs
) -> None

Save chain to parquet file with SignalSchema metadata.

Parameters:

path (str | PathLike[str] | BinaryIO) –

Path or a file-like binary object to save the file. This supports local paths as well as remote paths, such as s3:// or hf:// with fsspec.
partition_cols (Sequence[str] | None, default: None ) –

Column names by which to partition the dataset.
chunk_size (int, default: DEFAULT_PARQUET_CHUNK_SIZE ) –

The chunk size of results to read and convert to columnar data, to avoid running out of memory.
fs_kwargs (dict[str, Any] | None, default: None ) –

Optional kwargs forwarded to the underlying fsspec filesystem when writing (e.g., s3://, gs://, hf://), fsspec-specific options are supported.

Source code in datachain/lib/dc/datachain.py

def to_parquet(
    self,
    path: str | os.PathLike[str] | BinaryIO,
    partition_cols: Sequence[str] | None = None,
    chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
    fs_kwargs: dict[str, Any] | None = None,
    **kwargs,
) -> None:
    """Save chain to parquet file with SignalSchema metadata.

    Rows are converted and written chunk by chunk. Without `partition_cols`
    a single parquet file is written through one `ParquetWriter`, which is
    closed even if converting or writing a chunk fails. With `partition_cols`
    every chunk is written into a partitioned dataset rooted at `path`.

    Parameters:
        path: Path or a file-like binary object to save the file. This supports
            local paths as well as remote paths, such as s3:// or hf:// with fsspec.
        partition_cols: Column names by which to partition the dataset.
        chunk_size: The chunk size of results to read and convert to columnar
            data, to avoid running out of memory.
        fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
            when writing (e.g., s3://, gs://, hf://), fsspec-specific options
            are supported.
        **kwargs: Extra keyword arguments passed to `pq.write_to_dataset`
            (partitioned output) or `pq.ParquetWriter` (single file).
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    from datachain.lib.arrow import DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY

    fsspec_fs = None

    if isinstance(path, str) and "://" in path:
        from datachain.client.fsspec import Client

        # Explicit fs_kwargs take precedence over the catalog's client config.
        fs_kwargs = {
            **self._query.catalog.client_config,
            **(fs_kwargs or {}),
        }

        client = Client.get_implementation(path)

        if path.startswith("file://"):
            # pyarrow does not handle file:// uris, and needs a direct path instead.
            from urllib.parse import urlparse

            path = urlparse(path).path
            if sys.platform == "win32":
                path = os.path.normpath(path.lstrip("/"))

        fsspec_fs = client.create_fs(**fs_kwargs)

    _partition_cols = list(partition_cols) if partition_cols else None
    signal_schema_metadata = json.dumps(
        self._effective_signals_schema.serialize(), ensure_ascii=False
    ).encode("utf-8")

    column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)

    parquet_schema = None
    parquet_writer = None

    try:
        for chunk in column_chunks:
            # pyarrow infers the best parquet schema from the python types of
            # the input data. After the first chunk, the schema of that chunk
            # is reused for all the following ones.
            table = pa.Table.from_pydict(
                dict(zip(column_names, chunk, strict=False)),
                schema=parquet_schema,
            )

            # Preserve any existing metadata, and add the DataChain SignalSchema.
            existing_metadata = table.schema.metadata or {}
            merged_metadata = {
                **existing_metadata,
                DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY: signal_schema_metadata,
            }
            table = table.replace_schema_metadata(merged_metadata)
            parquet_schema = table.schema

            if _partition_cols:
                # Write to a partitioned parquet dataset.
                pq.write_to_dataset(
                    table,
                    root_path=path,
                    partition_cols=_partition_cols,
                    filesystem=fsspec_fs,
                    **kwargs,
                )
            else:
                if parquet_writer is None:
                    # Write to a single parquet file; the writer is created
                    # lazily once the schema of the first chunk is known.
                    parquet_writer = pq.ParquetWriter(
                        path, parquet_schema, filesystem=fsspec_fs, **kwargs
                    )

                parquet_writer.write_table(table)
    finally:
        # Always release the writer (and its underlying file handle), including
        # when a chunk fails to convert or write.
        if parquet_writer is not None:
            parquet_writer.close()

to_pytorch

to_pytorch(
    transform=None,
    tokenizer=None,
    tokenizer_kwargs=None,
    num_samples=0,
    remove_prefetched: bool = False,
)

Convert to pytorch dataset format.

Parameters:

transform (Transform, default: None ) –

Torchvision transforms to apply to the dataset.
tokenizer (Callable, default: None ) –

Tokenizer to use to tokenize text values.
tokenizer_kwargs (dict, default: None ) –

Additional kwargs to pass when calling tokenizer.
num_samples (int, default: 0 ) –

Number of random samples to draw for each epoch. This argument is ignored if num_samples=0 (the default).
remove_prefetched (bool, default: False ) –

Whether to remove prefetched files after reading.

Example

from torch.utils.data import DataLoader
loader = DataLoader(
    chain.select("file", "label").to_pytorch(),
    batch_size=16
)

Source code in datachain/lib/dc/datachain.py

def to_pytorch(
    self,
    transform=None,
    tokenizer=None,
    tokenizer_kwargs=None,
    num_samples=0,
    remove_prefetched: bool = False,
):
    """Wrap the chain in a PyTorch dataset.

    A chain whose query is not attached is persisted first; the resulting
    name and version are what the `PytorchDataset` is built from.

    Args:
        transform (Transform): Torchvision transforms to apply to the dataset.
        tokenizer (Callable): Tokenizer to use to tokenize text values.
        tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
        num_samples (int): Number of random samples to draw for each epoch.
            This argument is ignored if `num_samples=0` (the default).
        remove_prefetched (bool): Whether to remove prefetched files after reading.

    Example:
        ```py
        from torch.utils.data import DataLoader
        loader = DataLoader(
            chain.select("file", "label").to_pytorch(),
            batch_size=16
        )
        ```
    """
    from datachain.torch import PytorchDataset

    chain = self if self._query.attached else self.persist()
    assert chain.name is not None  # for mypy

    dataset_name, dataset_version = chain.name, chain.version
    return PytorchDataset(
        dataset_name,
        dataset_version,
        catalog=self.session.catalog,
        transform=transform,
        tokenizer=tokenizer,
        tokenizer_kwargs=tokenizer_kwargs,
        num_samples=num_samples,
        dc_settings=chain._settings,
        remove_prefetched=remove_prefetched,
    )

to_records

to_records() -> list[dict[str, Any]]

Convert every row to a dictionary.

Source code in datachain/lib/dc/datachain.py

def to_records(self) -> list[dict[str, Any]]:
    """Convert every row to a dictionary."""

    def to_dict(cols: list[str], row: tuple[Any, ...]) -> dict[str, Any]:
        return dict(zip(cols, row, strict=False))

    return self.results(row_factory=to_dict)

to_storage

to_storage(
    output: str | PathLike[str],
    signal: str = "file",
    placement: ExportPlacement = "fullpath",
    link_type: Literal["copy", "symlink"] = "copy",
    num_threads: int | None = EXPORT_FILES_MAX_THREADS,
    anon: bool | None = None,
    client_config: dict | None = None,
) -> None

Export files from a specified signal to a directory. Files can be exported to a local or cloud directory.

Parameters:

output (str | PathLike[str]) –

Path to the target directory for exporting files.
signal (str, default: 'file' ) –

Name of the signal to export files from.
placement (ExportPlacement, default: 'fullpath' ) –
The method to use for naming exported files. The possible values are: "filename", "etag", "fullpath", "filepath", and "checksum". Example path translations for an object located at s3://bucket/data/img.jpg and exported to ./out:
- "filename" -> ./out/img.jpg (no directories)
- "filepath" -> ./out/data/img.jpg (relative path kept)
- "fullpath" -> ./out/bucket/data/img.jpg (remote host kept)
- "etag" -> ./out/<etag>.jpg (unique name via object digest)
Local sources behave like "filepath" for "fullpath" placement. Relative destinations such as "." or ".." and absolute paths are supported for every strategy.
link_type (Literal['copy', 'symlink'], default: 'copy' ) –

Method to use for exporting files. Falls back to 'copy' if symlinking fails.
num_threads (int | None, default: EXPORT_FILES_MAX_THREADS ) –

number of threads to use for exporting files. By default, it uses 5 threads.
anon (bool | None, default: None ) –

If True, we will treat cloud bucket as a public one. Default behavior depends on the previous session configuration (e.g. happens in the initial read_storage) and particular cloud storage client implementation (e.g. S3 falls back to anonymous access if no credentials were found).
client_config (dict | None, default: None ) –

Optional configuration for the destination storage client

Example

Cross cloud transfer

import datachain as dc

ds = dc.read_storage("s3://mybucket")
ds.to_storage("gs://mybucket", placement="filename")

Source code in datachain/lib/dc/datachain.py

def to_storage(
    self,
    output: str | os.PathLike[str],
    signal: str = "file",
    placement: FileExportPlacement = "fullpath",
    link_type: Literal["copy", "symlink"] = "copy",
    num_threads: int | None = EXPORT_FILES_MAX_THREADS,
    anon: bool | None = None,
    client_config: dict | None = None,
) -> None:
    """Export the files of a signal into a local or cloud directory.

    The chain is persisted first. With the "filename" placement the export is
    refused when two files share the same name.

    Args:
        output: Path to the target directory for exporting files.
        signal: Name of the signal to export files from.
        placement: The method to use for naming exported files.
            The possible values are: "filename", "etag", "fullpath",
            "filepath", and "checksum".
            Example path translations for an object located at
            ``s3://bucket/data/img.jpg`` and exported to ``./out``:

            - "filename" -> ``./out/img.jpg`` (no directories)
            - "filepath" -> ``./out/data/img.jpg`` (relative path kept)
            - "fullpath" -> ``./out/bucket/data/img.jpg`` (remote host kept)
            - "etag" -> ``./out/<etag>.jpg`` (unique name via object digest)

            Local sources behave like "filepath" for "fullpath" placement.
            Relative destinations such as "." or ".." and absolute paths
            are supported for every strategy.
        link_type: Method to use for exporting files.
            Falls back to `'copy'` if symlinking fails.
        num_threads: Number of threads to use for exporting files.
            By default, it uses 5 threads.
        anon: If True, we will treat cloud bucket as a public one. Default behavior
            depends on the previous session configuration (e.g. happens in the
            initial `read_storage`) and particular cloud storage client
            implementation (e.g. S3 falls back to anonymous access if no
            credentials were found).
        client_config: Optional configuration for the destination storage client

    Raises:
        ValueError: If `placement` is "filename" and several files have the
            same name.

    Example:
        Cross cloud transfer
        ```py
        import datachain as dc

        ds = dc.read_storage("s3://mybucket")
        ds.to_storage("gs://mybucket", placement="filename")
        ```
    """
    persisted = self.persist()
    total = persisted.count()

    if placement == "filename":
        # Flat output: every file name has to be unique across the chain.
        distinct_names = persisted._query.distinct(
            pathfunc.name(C(f"{signal}__path"))
        ).count()
        if distinct_names != total:
            raise ValueError("Files with the same name found")

    if anon is not None:
        # An explicit `anon` overrides any "anon" key in the given config.
        client_config = (client_config or {}) | {"anon": anon}

    bar = tqdm(
        desc=f"Exporting files to {output}: ",
        unit=" files",
        unit_scale=True,
        unit_divisor=10,
        total=total,
        leave=False,
    )
    exporter = FileExporter(
        output,
        placement,
        self._settings.cache if self._settings else False,
        link_type,
        max_threads=num_threads or 1,
        client_config=client_config,
    )
    with closing(persisted.to_iter(signal)) as rows_iter:
        # Each row is a 1-tuple holding the value of the selected signal.
        files = (row[0] for row in rows_iter)
        exporter.run(files, bar)

to_values

to_values(col: str | Column) -> list[DataValue]

Returns a flat list of values from a single column.

Parameters:

col (str | Column) –

The column to extract values from. String names and plain C("...") column references are supported.

Returns:

list[DataValue] –

list[DataValue]: Returns a flat list of values from the specified column.

Example

Getting all values from a single column:

file_paths = dc.to_values("file.path")
print(file_paths)  # Returns list of strings

Getting all file sizes:

sizes = dc.to_values("file.size")
print(sizes)  # Returns list of integers

Source code in datachain/lib/dc/datachain.py

def to_values(self, col: str | Column) -> list[DataValue]:
    """Collect the values of one column into a flat list.

    Parameters:
        col: The column to extract values from. String names and plain
            ``C("...")`` column references are supported.

    Returns:
        list[DataValue]: The values of the specified column, one per row.

    Example:
        Getting all values from a single column:
        ```py
        file_paths = dc.to_values("file.path")
        print(file_paths)  # Returns list of strings
        ```

        Getting all file sizes:
        ```py
        sizes = dc.to_values("file.size")
        print(sizes)  # Returns list of integers
        ```
    """
    names = self._signal_names((col,), "to_values()")
    # Rows come back as tuples; only the first item of each one is kept.
    return [values[0] for values in self.to_list(*names)]

union

union(other: Self) -> Self

Return the set union of the two datasets.

Parameters:

other (Self) –

chain whose rows will be added to self.

Source code in datachain/lib/dc/datachain.py

@delta_disabled
def union(self, other: "Self") -> "Self":
    """Return the set union of the two datasets.

    Parameters:
        other: chain whose rows will be added to `self`.

    Raises:
        UnionSchemaMismatchError: If, after alignment, either chain has
            signals the other one is missing.
    """
    left, right = self._align_optional_for_union(other)
    left_schema = left.signals_schema
    right_schema = right.signals_schema

    only_left, only_right = left_schema.compare_signals(right_schema)
    schemas_match = not (only_left or only_right)
    if not schemas_match:
        raise UnionSchemaMismatchError.from_column_sets(only_left, only_right)

    # Build a new chain rather than mutating `left` in place: it may be the
    # caller's own chain.
    combined_query = left._query.union(right._query)
    return left._evolve(
        query=combined_query,
        signal_schema=left_schema.clone_without_sys_signals(),
    )

DataChainError

Bases: Exception

Session

Session(
    name="",
    catalog: Catalog | None = None,
    client_config: dict | None = None,
    in_memory: bool = False,
)

Session is a context that keeps track of temporary DataChain datasets for a proper cleanup. By default, a global session is created.

Temporary or ephemeral datasets are the ones created without specified name. They are useful for optimization purposes and should be automatically removed.

Temp dataset has specific name format

"session_<name>_<session_uuid>_<dataset_uuid>"

The <name> suffix is optional. Both <uuid>s are auto-generated.

Temp dataset examples

session_myname_624b41_48e8b4 session_4b962d_2a5dff

Parameters:

name (str): The name of the session. Only letters and numbers are supported. It can be empty. catalog (Catalog): Catalog object.

Source code in datachain/query/session.py

def __init__(
    self,
    name="",
    catalog: "Catalog | None" = None,
    client_config: dict | None = None,
    in_memory: bool = False,
):
    """Set up the session name and catalog, and register the session.

    Parameters:
        name: Session name made of letters and digits only; an empty name
            selects the global session name.
        catalog: Catalog to use. When not given, one is obtained from
            `get_catalog` with `client_config` and `in_memory`.
        client_config: Client configuration for a newly obtained catalog.
        in_memory: Passed to `get_catalog` for a newly obtained catalog.

    Raises:
        ValueError: If `name` contains anything but letters and digits.
    """
    name_is_valid = re.match(r"^[0-9a-zA-Z]*$", name) is not None
    if not name_is_valid:
        raise ValueError(
            f"Session name can contain only letters or numbers - '{name}' given."
        )

    base_name = name or self.GLOBAL_SESSION_NAME
    # A short random suffix distinguishes sessions sharing a base name.
    suffix = uuid4().hex[: self.SESSION_UUID_LEN]
    self.name = f"{base_name}_{suffix}"

    self.is_new_catalog = not catalog
    if catalog:
        self.catalog = catalog
    else:
        self.catalog = get_catalog(client_config=client_config, in_memory=in_memory)

    Session._ALL_SESSIONS.add(self)

get `classmethod`

get(
    session: Session | None = None,
    catalog: Catalog | None = None,
    client_config: dict | None = None,
    in_memory: bool = False,
) -> Session

Creates a Session() object from a catalog.

Parameters:

session (Session, default: None ) –

Optional Session(). If not provided a new session will be created. It's needed mostly for simple API purposes.
catalog (Catalog, default: None ) –

Optional catalog. By default, a new catalog is created.

Source code in datachain/query/session.py

@classmethod
def get(
    cls,
    session: "Session | None" = None,
    catalog: "Catalog | None" = None,
    client_config: dict | None = None,
    in_memory: bool = False,
) -> "Session":
    """Creates a Session() object from a catalog.

    Parameters:
        session (Session): Optional Session(). If not provided a new session will
                be created. It's needed mostly for simple API purposes.
        catalog (Catalog): Optional catalog. By default, a new catalog is created.
    """
    if session:
        return session

    # Access the active (most recent) context from the stack
    if cls.SESSION_CONTEXTS:
        session = cls.SESSION_CONTEXTS[-1]

    elif cls.GLOBAL_SESSION_CTX is None:
        cls.GLOBAL_SESSION_CTX = Session(
            cls.GLOBAL_SESSION_NAME,
            catalog,
            client_config=client_config,
            in_memory=in_memory,
        )
        session = cls.GLOBAL_SESSION_CTX

        atexit.register(cls._global_cleanup)
        cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
        sys.excepthook = cls.except_hook
    else:
        session = cls.GLOBAL_SESSION_CTX

    if client_config and session.catalog.client_config != client_config:
        session = Session(
            "session" + uuid4().hex[:4],
            catalog,
            client_config=client_config,
            in_memory=in_memory,
        )
        session.__enter__()

    return session

get_job

get_job() -> Job | None

Return the current job if one exists, without creating a new one.

Checks the cached _CURRENT_JOB and DATACHAIN_JOB_ID env var. Returns None if no job is found.

Source code in datachain/query/session.py

def get_job(self) -> "Job | None":
    """
    Look up the current job without ever creating one.

    The cached ``_CURRENT_JOB`` is returned when set. Otherwise the job named
    by the ``DATACHAIN_JOB_ID`` env var is fetched from the metastore and
    cached. Returns None if no job is found.
    """
    cached = Session._CURRENT_JOB
    if cached:
        return cached

    env_job_id = os.getenv("DATACHAIN_JOB_ID")
    if not env_job_id:
        return None

    job = self.catalog.metastore.get_job(env_job_id)
    Session._CURRENT_JOB = job
    if job:
        # The job was fetched by id here, not created by this process.
        Session._OWNS_JOB = False
    return job

get_or_create_job

get_or_create_job() -> Job

Get or create a Job for this process.

Returns:

Job ( Job ) –

The active Job instance.

Behavior

If a job already exists, it is returned.
If in Studio without DATACHAIN_JOB_ID, raises an error.
If DATACHAIN_JOB_ID is set, the corresponding job is fetched.
Otherwise, a new job is created:
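- Name = absolute path to the Python script, or a random UUID for interactive/module runs.
- Query = contents of the script if it can be read, otherwise an empty string.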
- Parent = last job with the same name, if available.
- Status = "running". Exit hooks are registered to finalize the job.

Note

Job is shared across all Session instances to ensure one job per process.

Source code in datachain/query/session.py

def get_or_create_job(self) -> "Job":
    """
    Get or create a Job for this process.

    Returns:
        Job: The active Job instance.

    Raises:
        JobNotFoundError: If ``DATACHAIN_JOB_ID`` is set but no such job exists.
        DataChainError: If running in Studio without ``DATACHAIN_JOB_ID``.

    Behavior:
        - If a job already exists, it is returned.
        - If ``DATACHAIN_JOB_ID`` is set, the corresponding job is fetched.
        - If in Studio without DATACHAIN_JOB_ID, raises an error.
        - Otherwise, a new job is created:
            * Name = absolute path to the Python script, or a random UUID
              for interactive sessions and module runs.
            * Query = contents of the script if it can be read, otherwise
              an empty string.
            * Parent = last job with the same name, if available.
            * Status = "running".
          Exit hooks are registered to finalize the job.

    Note:
        Job is shared across all Session instances to ensure one job per process.
    """
    if Session._CURRENT_JOB:
        return Session._CURRENT_JOB

    from datachain.lib.dc.utils import is_studio

    if env_job_id := os.getenv("DATACHAIN_JOB_ID"):
        # SaaS run: just fetch existing job
        Session._CURRENT_JOB = self.catalog.metastore.get_job(env_job_id)
        if not Session._CURRENT_JOB:
            raise JobNotFoundError(
                f"Job {env_job_id} from DATACHAIN_JOB_ID env not found"
            )
        Session._OWNS_JOB = False
    elif is_studio():
        raise DataChainError(
            "Cannot create job in Studio without DATACHAIN_JOB_ID. "
            "This usually means an internal operation is missing "
            "ephemeral mode."
        )
    else:
        # Local run: create new job
        query = ""
        if is_script_run():
            script = os.path.abspath(sys.argv[0])
            try:
                # Python sources are UTF-8 by default; do not depend on the
                # locale encoding of the machine running the script.
                with open(script, encoding="utf-8") as f:
                    query = f.read()
            except (OSError, UnicodeDecodeError):
                # Recording the script text is best-effort: an unreadable or
                # undecodable script leaves the query empty.
                query = ""
        else:
            # Interactive session or module run - use unique name to avoid
            # linking unrelated sessions
            script = str(uuid4())
        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

        # try to find the parent job for checkpoint/rerun chain
        parent = self.catalog.metastore.get_last_job_by_name(script)

        job_id = self.catalog.metastore.create_job(
            name=script,
            query=query,
            query_type=JobQueryType.PYTHON,
            status=JobStatus.RUNNING,
            python_version=python_version,
            rerun_from_job_id=parent.id if parent else None,
            run_group_id=parent.run_group_id if parent else None,
        )
        Session._CURRENT_JOB = self.catalog.metastore.get_job(job_id)
        Session._OWNS_JOB = True
        Session._JOB_STATUS = JobStatus.RUNNING

        # register cleanup hooks only once
        if not Session._JOB_HOOKS_REGISTERED:

            def _finalize_success_hook() -> None:
                self._finalize_job_success()

            Session._JOB_FINALIZE_HOOK = _finalize_success_hook
            atexit.register(Session._JOB_FINALIZE_HOOK)
            Session._JOB_HOOKS_REGISTERED = True

    assert Session._CURRENT_JOB is not None
    return Session._CURRENT_JOB

Sys

Bases: DataModel

Model for internal DataChain signals id and rand.

DataChain

C module-attribute

ConnectionType module-attribute

listings

read_csv

read_dataset

read_hf

read_json

read_pandas

read_parquet

read_records

read_storage

read_zarr

read_values

read_database

datasets

delete_dataset

move_dataset

delete_namespace

is_studio

is_local

Column

glob

regexp

ColumnExpr

DataChainSchema

__str__

flatten

to_string

DataChain

dataset property

delta property

delta_unsafe property

empty property

job property

name property

namespace_name property

project_name property

schema property

session property

version property

__iter__

__or__

__repr__

agg

apply

avg

c

chunk

clone

column

count

diff

distinct

exec

explode

file_diff

filter

gen

group_by

limit

map

max

merge

min

mutate

offset

order_by

parse_tabular

persist

print_schema

reset_settings

results

sample

save

select

select_except

settings

setup

show

C `module-attribute`

ConnectionType `module-attribute`

str

dataset `property`

delta `property`

delta_unsafe `property`

empty `property`

job `property`

name `property`

namespace_name `property`

project_name `property`

schema `property`

session `property`

version `property`

__iter__

__or__

__repr__

get `classmethod`