Source code for feast.infra.offline_stores.contrib.mssql_offline_store.mssql

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import warnings
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas
import pyarrow
import pyarrow as pa
import pyarrow.parquet  # used by persist() below
import sqlalchemy
from pydantic.types import StrictStr
from pydantic.typing import Literal
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker

from feast import FileSource, errors
from feast.data_source import DataSource
from feast.errors import InvalidEntityType
from feast.feature_logging import LoggingConfig, LoggingSource
from feast.feature_view import FeatureView
from feast.infra.offline_stores import offline_utils
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalMetadata
from feast.infra.offline_stores.offline_utils import (
    DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL,
    build_point_in_time_query,
    get_feature_view_query_context,
)
from feast.infra.provider import RetrievalJob
from feast.infra.registry.base_registry import BaseRegistry
from feast.on_demand_feature_view import OnDemandFeatureView
from feast.repo_config import FeastBaseModel, RepoConfig
from feast.saved_dataset import SavedDatasetStorage
from feast.type_map import pa_to_mssql_type
from feast.usage import log_exceptions_and_usage

# Make sure warning doesn't raise more than once.
warnings.simplefilter("once", RuntimeWarning)

EntitySchema = Dict[str, np.dtype]


class MsSqlServerOfflineStoreConfig(FeastBaseModel):
    """Offline store config for SQL Server"""

    type: Literal["mssql"] = "mssql"
    """ Offline store type selector"""

    connection_string: StrictStr = "mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test?driver=ODBC+Driver+17+for+SQL+Server"
    """Connection string containing the host, port, and configuration parameters for SQL Server.
    Format: SQLAlchemy connection string, e.g.
    mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test?driver=ODBC+Driver+17+for+SQL+Server"""

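# Illustrative sketch (not part of this module): constructing the config above directly in
# Python. The host, credentials, and database name are placeholder assumptions; in a Feast
# repo the same settings are normally supplied via the `offline_store` block of
# `feature_store.yaml` with `type: mssql`.
#
#   example_config = MsSqlServerOfflineStoreConfig(
#       connection_string=(
#           "mssql+pyodbc://sa:yourStrong(!)Password@localhost:1433/feast_test"
#           "?driver=ODBC+Driver+17+for+SQL+Server"
#       )
#   )
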
def make_engine(config: MsSqlServerOfflineStoreConfig) -> Engine:
    return create_engine(config.connection_string)

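# Illustrative sketch: `make_engine` is a thin wrapper around SQLAlchemy's `create_engine`,
# so the returned engine can be used directly, e.g. to sanity-check connectivity
# (SQLAlchemy 1.x-style raw string execution is assumed here, matching the rest of this module).
#
#   engine = make_engine(example_config)  # `example_config` is the hypothetical config above
#   with engine.connect() as conn:
#       print(conn.execute("SELECT 1").scalar())
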
class MsSqlServerOfflineStore(OfflineStore):
    """Microsoft SQL Server based offline store, supporting Azure Synapse or Azure SQL.

    Note: to use this, you'll need to have Microsoft ODBC 17 installed.
    See https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15#17
    """

    @staticmethod
    @log_exceptions_and_usage(offline_store="mssql")
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        timestamp_field: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        warnings.warn(
            "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. "
            "Some functionality may still be unstable and can change in the future.",
            RuntimeWarning,
        )
        assert type(data_source).__name__ == "MsSqlServerSource"
        from_expression = data_source.get_table_query_string().replace("`", "")

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = (
                "PARTITION BY " + partition_by_join_key_string
            )
        timestamps = [timestamp_field]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

        query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression} inner_t
                WHERE {timestamp_field} BETWEEN CONVERT(DATETIMEOFFSET, '{start_date}', 120) AND CONVERT(DATETIMEOFFSET, '{end_date}', 120)
            ) outer_t
            WHERE outer_t._feast_row = 1
            """
        engine = make_engine(config.offline_store)
        return MsSqlServerRetrievalJob(
            query=query,
            engine=engine,
            config=config.offline_store,
            full_feature_names=False,
            on_demand_feature_views=None,
        )

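    # Illustrative sketch of the query built above, assuming the hypothetical inputs
    # join_key_columns=["driver_id"], feature_name_columns=["conv_rate"],
    # timestamp_field="event_timestamp" and no created_timestamp_column:
    #
    #   SELECT driver_id, conv_rate, event_timestamp
    #   FROM (
    #       SELECT driver_id, conv_rate, event_timestamp,
    #       ROW_NUMBER() OVER(PARTITION BY driver_id ORDER BY event_timestamp DESC) AS _feast_row
    #       FROM <from_expression> inner_t
    #       WHERE event_timestamp BETWEEN CONVERT(DATETIMEOFFSET, '<start_date>', 120)
    #           AND CONVERT(DATETIMEOFFSET, '<end_date>', 120)
    #   ) outer_t
    #   WHERE outer_t._feast_row = 1
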
    @staticmethod
    @log_exceptions_and_usage(offline_store="mssql")
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        timestamp_field: str,
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert type(data_source).__name__ == "MsSqlServerSource"
        warnings.warn(
            "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. "
            "Some functionality may still be unstable and can change in the future.",
            RuntimeWarning,
        )
        from_expression = data_source.get_table_query_string().replace("`", "")

        timestamps = [timestamp_field]
        field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

        # T-SQL has no TIMESTAMP '...' literal, so the time bounds are converted with
        # CONVERT(DATETIMEOFFSET, ...), as in pull_latest_from_table_or_query above.
        query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string}
                FROM {from_expression} inner_t
                WHERE {timestamp_field} BETWEEN CONVERT(DATETIMEOFFSET, '{start_date}', 120) AND CONVERT(DATETIMEOFFSET, '{end_date}', 120)
            ) outer_t
            """
        engine = make_engine(config.offline_store)
        return MsSqlServerRetrievalJob(
            query=query,
            engine=engine,
            config=config.offline_store,
            full_feature_names=False,
            on_demand_feature_views=None,
        )

    @staticmethod
    @log_exceptions_and_usage(offline_store="mssql")
    def get_historical_features(
        config: RepoConfig,
        feature_views: List[FeatureView],
        feature_refs: List[str],
        entity_df: Union[pandas.DataFrame, str],
        registry: BaseRegistry,
        project: str,
        full_feature_names: bool = False,
    ) -> RetrievalJob:
        warnings.warn(
            "The Azure Synapse + Azure SQL offline store is an experimental feature in alpha development. "
            "Some functionality may still be unstable and can change in the future.",
            RuntimeWarning,
        )
        expected_join_keys = _get_join_keys(project, feature_views, registry)
        assert isinstance(config.offline_store, MsSqlServerOfflineStoreConfig)
        engine = make_engine(config.offline_store)

        if isinstance(entity_df, pandas.DataFrame):
            entity_df_event_timestamp_col = (
                offline_utils.infer_event_timestamp_from_entity_df(
                    dict(zip(list(entity_df.columns), list(entity_df.dtypes)))
                )
            )
            entity_df[entity_df_event_timestamp_col] = pandas.to_datetime(
                entity_df[entity_df_event_timestamp_col], utc=True
            ).fillna(pandas.Timestamp.now())
        elif isinstance(entity_df, str):
            raise ValueError(
                "string entities are currently not supported in the MsSQL offline store."
            )

        (
            table_schema,
            table_name,
        ) = _upload_entity_df_into_sqlserver_and_get_entity_schema(
            engine, config, entity_df, full_feature_names=full_feature_names
        )

        _assert_expected_columns_in_sqlserver(
            expected_join_keys,
            entity_df_event_timestamp_col,
            table_schema,
        )

        entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
            entity_df,
            entity_df_event_timestamp_col,
            engine,
        )

        # Build a query context containing all information required to template the SQL query
        query_context = get_feature_view_query_context(
            feature_refs,
            feature_views,
            registry,
            project,
            entity_df_timestamp_range=entity_df_event_timestamp_range,
        )

        # Generate the SQL query from the query context
        query = build_point_in_time_query(
            query_context,
            left_table_query_string=table_name,
            entity_df_event_timestamp_col=entity_df_event_timestamp_col,
            entity_df_columns=table_schema.keys(),
            full_feature_names=full_feature_names,
            query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
        )
        query = query.replace("`", "")

        job = MsSqlServerRetrievalJob(
            query=query,
            engine=engine,
            config=config.offline_store,
            full_feature_names=full_feature_names,
            on_demand_feature_views=registry.list_on_demand_feature_views(project),
        )
        return job

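    # Illustrative sketch (hypothetical feature view and feature names): retrieving historical
    # features through the public FeatureStore API, which dispatches to
    # `get_historical_features` above when the configured offline store is of type "mssql".
    #
    #   import pandas as pd
    #   from feast import FeatureStore
    #
    #   store = FeatureStore(repo_path=".")
    #   entity_df = pd.DataFrame(
    #       {
    #           "driver_id": [1001, 1002],
    #           "event_timestamp": pd.to_datetime(
    #               ["2021-04-12 10:59:42", "2021-04-12 08:12:10"], utc=True
    #           ),
    #       }
    #   )
    #   training_df = store.get_historical_features(
    #       entity_df=entity_df,
    #       features=["driver_hourly_stats:conv_rate"],
    #   ).to_df()
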
    @staticmethod
    def write_logged_features(
        config: RepoConfig,
        data: Union[pyarrow.Table, Path],
        source: LoggingSource,
        logging_config: LoggingConfig,
        registry: BaseRegistry,
    ):
        raise NotImplementedError()

    @staticmethod
    def offline_write_batch(
        config: RepoConfig,
        feature_view: FeatureView,
        table: pyarrow.Table,
        progress: Optional[Callable[[int], Any]],
    ):
        raise NotImplementedError()


def _assert_expected_columns_in_dataframe(
    join_keys: Set[str],
    entity_df_event_timestamp_col: str,
    entity_df: pandas.DataFrame,
):
    entity_df_columns = set(entity_df.columns.values)
    expected_columns = join_keys.copy()
    expected_columns.add(entity_df_event_timestamp_col)

    missing_keys = expected_columns - entity_df_columns

    if len(missing_keys) != 0:
        raise errors.FeastEntityDFMissingColumnsError(expected_columns, missing_keys)


def _assert_expected_columns_in_sqlserver(
    join_keys: Set[str],
    entity_df_event_timestamp_col: str,
    table_schema: EntitySchema,
):
    entity_columns = set(table_schema.keys())
    expected_columns = join_keys.copy()
    expected_columns.add(entity_df_event_timestamp_col)

    missing_keys = expected_columns - entity_columns

    if len(missing_keys) != 0:
        raise errors.FeastEntityDFMissingColumnsError(expected_columns, missing_keys)


def _get_join_keys(
    project: str, feature_views: List[FeatureView], registry: BaseRegistry
) -> Set[str]:
    join_keys = set()
    for feature_view in feature_views:
        entities = feature_view.entities
        for entity_name in entities:
            entity = registry.get_entity(entity_name, project)
            join_keys.add(entity.join_key)
    return join_keys


def _infer_event_timestamp_from_sqlserver_schema(table_schema) -> str:
    if any(
        schema_field["COLUMN_NAME"] == DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
        for schema_field in table_schema
    ):
        return DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    else:
        datetime_columns = list(
            filter(
                lambda schema_field: schema_field["DATA_TYPE"] == "DATETIMEOFFSET",
                table_schema,
            )
        )
        if len(datetime_columns) == 1:
            print(
                f"Using {datetime_columns[0]['COLUMN_NAME']} as the event timestamp. "
                f"To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            # Schema fields are dict-like rows, so look the column name up by key.
            return datetime_columns[0]["COLUMN_NAME"]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named "
                f"{DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

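# Illustrative sketch (hypothetical rows): `_infer_event_timestamp_from_sqlserver_schema`
# expects an iterable of INFORMATION_SCHEMA-style rows keyed by COLUMN_NAME and DATA_TYPE:
#
#   table_schema = [
#       {"COLUMN_NAME": "driver_id", "DATA_TYPE": "BIGINT"},
#       {"COLUMN_NAME": "created", "DATA_TYPE": "DATETIMEOFFSET"},
#   ]
#   _infer_event_timestamp_from_sqlserver_schema(table_schema)  # -> "created"
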

class MsSqlServerRetrievalJob(RetrievalJob):
    def __init__(
        self,
        query: str,
        engine: Engine,
        config: MsSqlServerOfflineStoreConfig,
        full_feature_names: bool,
        on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None,
        metadata: Optional[RetrievalMetadata] = None,
        drop_columns: Optional[List[str]] = None,
    ):
        self.query = query
        self.engine = engine
        self._config = config
        self._full_feature_names = full_feature_names
        self._on_demand_feature_views = on_demand_feature_views or []
        self._drop_columns = drop_columns
        self._metadata = metadata

    @property
    def full_feature_names(self) -> bool:
        return self._full_feature_names

    @property
    def on_demand_feature_views(self) -> List[OnDemandFeatureView]:
        return self._on_demand_feature_views

    def _to_df_internal(self, timeout: Optional[int] = None) -> pandas.DataFrame:
        return pandas.read_sql(self.query, con=self.engine).fillna(value=np.nan)

    def _to_arrow_internal(self, timeout: Optional[int] = None) -> pyarrow.Table:
        result = pandas.read_sql(self.query, con=self.engine).fillna(value=np.nan)
        return pyarrow.Table.from_pandas(result)

    # Implements persist in Feast 0.18 - this persists to file storage
    # TODO: Persist to Azure Storage
    def persist(
        self,
        storage: SavedDatasetStorage,
        allow_overwrite: Optional[bool] = False,
        timeout: Optional[int] = None,
    ):
        assert isinstance(storage, SavedDatasetFileStorage)

        filesystem, path = FileSource.create_filesystem_and_path(
            storage.file_options.uri,
            storage.file_options.s3_endpoint_override,
        )

        if path.endswith(".parquet"):
            pyarrow.parquet.write_table(
                self.to_arrow(), where=path, filesystem=filesystem
            )
        else:
            # otherwise assume destination is directory
            pyarrow.parquet.write_to_dataset(
                self.to_arrow(), root_path=path, filesystem=filesystem
            )

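    # Illustrative sketch (hypothetical `job` variable and path): persisting a retrieval
    # job's result to a local Parquet file via SavedDatasetFileStorage.
    #
    #   from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
    #
    #   job.persist(SavedDatasetFileStorage(path="/tmp/mssql_training_data.parquet"))
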
    def supports_remote_storage_export(self) -> bool:
        return False

    def to_remote_storage(self) -> List[str]:
        raise NotImplementedError()

    @property
    def metadata(self) -> Optional[RetrievalMetadata]:
        return self._metadata


def _upload_entity_df_into_sqlserver_and_get_entity_schema(
    engine: sqlalchemy.engine.Engine,
    config: RepoConfig,
    entity_df: Union[pandas.DataFrame, str],
    full_feature_names: bool,
) -> Tuple[Dict[Any, Any], str]:
    """
    Uploads a Pandas entity dataframe into a SQL Server table
    and constructs the schema from the original entity_df dataframe.
    """
    table_id = offline_utils.get_temp_entity_table_name()
    session = sessionmaker(bind=engine)()

    if type(entity_df) is str:
        # TODO: This should be a temporary table, right?
        session.execute(f"SELECT * INTO {table_id} FROM ({entity_df}) t")  # type: ignore
        session.commit()

        limited_entity_df = MsSqlServerRetrievalJob(
            f"SELECT TOP 1 * FROM {table_id}",
            engine,
            config.offline_store,
            full_feature_names=full_feature_names,
            on_demand_feature_views=None,
        ).to_df()

        entity_schema = (
            dict(zip(limited_entity_df.columns, limited_entity_df.dtypes)),
            table_id,
        )
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        engine.execute(_df_to_create_table_sql(entity_df, table_id))
        entity_df.to_sql(name=table_id, con=engine, index=False, if_exists="append")

        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes)), table_id
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a SQL Server SQL query,"
            f" or a Pandas dataframe. But we found: {type(entity_df)} "
        )
    return entity_schema


def _df_to_create_table_sql(df: pandas.DataFrame, table_name: str) -> str:
    pa_table = pa.Table.from_pandas(df)
    columns = [f""""{f.name}" {pa_to_mssql_type(f.type)}""" for f in pa_table.schema]
    return f"""
        CREATE TABLE "{table_name}" (
            {", ".join(columns)}
        );
        """


def _get_entity_df_event_timestamp_range(
    entity_df: Union[pandas.DataFrame, str],
    entity_df_event_timestamp_col: str,
    engine: Engine,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pandas.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pandas.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pandas.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range
        # from the uploaded table
        df = pandas.read_sql(entity_df, con=engine).fillna(value=np.nan)
        entity_df_event_timestamp = df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pandas.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pandas.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range


# TODO: Optimizations
#   * Use NEWID() instead of ROW_NUMBER(), or join on entity columns directly
#   * Precompute ROW_NUMBER() so that it doesn't have to be recomputed for every query on entity_dataframe
#   * Create temporary tables instead of keeping all tables in memory

MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """
/*
 Compute a deterministic key for the `left_table_query_string` that will be used throughout
 all the logic as the field to GROUP BY the data
*/
WITH entity_dataframe AS (
    SELECT *,
        {{entity_df_event_timestamp_col}} AS entity_timestamp
        {% for featureview in featureviews %}
            ,CONCAT(
                {% for entity_key in unique_entity_keys %}
                {{entity_key}},
                {% endfor %}
                {{entity_df_event_timestamp_col}}
            ) AS {{featureview.name}}__entity_row_unique_id
        {% endfor %}
    FROM {{ left_table_query_string }}
),

{% for featureview in featureviews %}

{{ featureview.name }}__entity_dataframe AS (
    SELECT
        {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %}
        entity_timestamp,
        {{featureview.name}}__entity_row_unique_id
    FROM entity_dataframe
    GROUP BY
        {{ featureview.entities | join(', ')}}{% if featureview.entities %},{% else %}{% endif %}
        entity_timestamp,
        {{featureview.name}}__entity_row_unique_id
),

/*
 This query template performs the point-in-time correctness join for a single feature set table
 to the provided entity table.

 1. We first join the current feature_view to the entity dataframe that has been passed.
 This JOIN has the following logic:
    - For each row of the entity dataframe, only keep the rows where the `timestamp_field`
      is less than the one provided in the entity dataframe
    - If there is a TTL for the current feature_view, also keep the rows where the
      `timestamp_field` is higher than the one provided minus the TTL
    - For each row, join on the entity key and retrieve the `entity_row_unique_id` that has
      been computed previously

 The output of this CTE will contain all the necessary information and already filtered out
 most of the data that is not relevant.
*/

{{ featureview.name }}__subquery AS (
    SELECT
        {{ featureview.timestamp_field }} as event_timestamp,
        {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }}
        {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %}
        {% for feature in featureview.features %}
            {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}{% if loop.last %}{% else %}, {% endif %}
        {% endfor %}
    FROM {{ featureview.table_subquery }}
    WHERE {{ featureview.timestamp_field }} <= '{{ featureview.max_event_timestamp }}'
    {% if featureview.ttl == 0 %}{% else %}
    AND {{ featureview.timestamp_field }} >= '{{ featureview.min_event_timestamp }}'
    {% endif %}
),

{{ featureview.name }}__base AS (
    SELECT
        subquery.*,
        entity_dataframe.{{entity_df_event_timestamp_col}} AS entity_timestamp,
        entity_dataframe.{{featureview.name}}__entity_row_unique_id
    FROM {{ featureview.name }}__subquery AS subquery
    INNER JOIN entity_dataframe
    ON 1=1
        AND subquery.event_timestamp <= entity_dataframe.{{entity_df_event_timestamp_col}}

        {% if featureview.ttl == 0 %}{% else %}
        AND {{ featureview.ttl }} >= DATEDIFF(SECOND, subquery.event_timestamp, entity_dataframe.{{entity_df_event_timestamp_col}})
        {% endif %}

        {% for entity in featureview.entities %}
        AND subquery.{{ entity }} = entity_dataframe.{{ entity }}
        {% endfor %}
),

/*
 2. If the `created_timestamp_column` has been set, we need to
 deduplicate the data first. This is done by calculating the
 `MAX(created_at_timestamp)` for each event_timestamp.
 We then join the data on the next CTE
*/
{% if featureview.created_timestamp_column %}
{{ featureview.name }}__dedup AS (
    SELECT
        {{featureview.name}}__entity_row_unique_id,
        event_timestamp,
        MAX(created_timestamp) as created_timestamp
    FROM {{ featureview.name }}__base
    GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp
),
{% endif %}

/*
 3. The data has been filtered during the first CTE "*__base"
 Thus we only need to compute the latest timestamp of each feature.
*/
{{ featureview.name }}__latest AS (
    SELECT
        {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id,
        MAX({{ featureview.name }}__base.event_timestamp) AS event_timestamp
        {% if featureview.created_timestamp_column %}
            ,MAX({{ featureview.name }}__base.created_timestamp) AS created_timestamp
        {% endif %}
    FROM {{ featureview.name }}__base
    {% if featureview.created_timestamp_column %}
        INNER JOIN {{ featureview.name }}__dedup
        ON {{ featureview.name }}__dedup.{{ featureview.name }}__entity_row_unique_id = {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id
        AND {{ featureview.name }}__dedup.event_timestamp = {{ featureview.name }}__base.event_timestamp
        AND {{ featureview.name }}__dedup.created_timestamp = {{ featureview.name }}__base.created_timestamp
    {% endif %}
    GROUP BY {{ featureview.name }}__base.{{ featureview.name }}__entity_row_unique_id
),

/*
 4. Once we know the latest value of each feature for a given timestamp,
 we can join the data back to the original "base" dataset
*/
{{ featureview.name }}__cleaned AS (
    SELECT base.*
    FROM {{ featureview.name }}__base as base
    INNER JOIN {{ featureview.name }}__latest
    ON base.{{ featureview.name }}__entity_row_unique_id = {{ featureview.name }}__latest.{{ featureview.name }}__entity_row_unique_id
        AND base.event_timestamp = {{ featureview.name }}__latest.event_timestamp
        {% if featureview.created_timestamp_column %}
        AND base.created_timestamp = {{ featureview.name }}__latest.created_timestamp
        {% endif %}
){% if loop.last %}{% else %}, {% endif %}

{% endfor %}

/*
 Joins the outputs of multiple time travel joins into a single table.
 The entity_dataframe dataset is our source of truth here.
*/
SELECT {{ final_output_feature_names | join(', ')}}
FROM entity_dataframe
{% for featureview in featureviews %}
LEFT JOIN (
    SELECT
        {{featureview.name}}__entity_row_unique_id
        {% for feature in featureview.features %}
            ,{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}
        {% endfor %}
    FROM "{{ featureview.name }}__cleaned"
) {{ featureview.name }}__cleaned
ON {{ featureview.name }}__cleaned.{{ featureview.name }}__entity_row_unique_id = entity_dataframe.{{ featureview.name }}__entity_row_unique_id
{% endfor %}
"""