Source code for feast.driver_test_data

# This module generates dummy data to be used for tests and examples.
import numpy as np
import pandas as pd

from feast.infra.provider import ENTITY_DF_EVENT_TIMESTAMP_COL


[docs]def create_orders_df( customers, drivers, start_date, end_date, order_count ) -> pd.DataFrame: """ Example df generated by this function: | order_id | driver_id | customer_id | order_is_success | event_timestamp | +----------+-----------+-------------+------------------+---------------------+ | 100 | 5004 | 1007 | 0 | 2021-03-10 19:31:15 | | 101 | 5003 | 1006 | 0 | 2021-03-11 22:02:50 | | 102 | 5010 | 1005 | 0 | 2021-03-13 00:34:24 | | 103 | 5010 | 1001 | 1 | 2021-03-14 03:05:59 | """ df = pd.DataFrame() df["order_id"] = [order_id for order_id in range(100, 100 + order_count)] df["driver_id"] = np.random.choice(drivers, order_count) df["customer_id"] = np.random.choice(customers, order_count) df["order_is_success"] = np.random.randint(0, 2, size=order_count).astype(np.int32) df[ENTITY_DF_EVENT_TIMESTAMP_COL] = [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in pd.date_range(start=start_date, end=end_date, periods=order_count) ] df.sort_values( by=[ENTITY_DF_EVENT_TIMESTAMP_COL, "order_id", "driver_id", "customer_id"], inplace=True, ) return df
[docs]def create_driver_hourly_stats_df(drivers, start_date, end_date) -> pd.DataFrame: """ Example df generated by this function: | datetime | driver_id | conv_rate | acc_rate | avg_daily_trips | created | |------------------+-----------+-----------+----------+-----------------+------------------| | 2021-03-17 19:31 | 5010 | 0.229297 | 0.685843 | 861 | 2021-03-24 19:34 | | 2021-03-17 20:31 | 5010 | 0.781655 | 0.861280 | 769 | 2021-03-24 19:34 | | 2021-03-17 21:31 | 5010 | 0.150333 | 0.525581 | 778 | 2021-03-24 19:34 | | 2021-03-17 22:31 | 5010 | 0.951701 | 0.228883 | 570 | 2021-03-24 19:34 | | 2021-03-17 23:31 | 5010 | 0.819598 | 0.262503 | 473 | 2021-03-24 19:34 | | | ... | ... | ... | ... | | | 2021-03-24 16:31 | 5001 | 0.061585 | 0.658140 | 477 | 2021-03-24 19:34 | | 2021-03-24 17:31 | 5001 | 0.088949 | 0.303897 | 618 | 2021-03-24 19:34 | | 2021-03-24 18:31 | 5001 | 0.096652 | 0.747421 | 480 | 2021-03-24 19:34 | | 2021-03-17 19:31 | 5005 | 0.142936 | 0.707596 | 466 | 2021-03-24 19:34 | | 2021-03-17 19:31 | 5005 | 0.142936 | 0.707596 | 466 | 2021-03-24 19:34 | """ df_hourly = pd.DataFrame( { "datetime": [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in pd.date_range( start=start_date, end=end_date, freq="1H", closed="left" ) ] } ) df_all_drivers = pd.DataFrame() for driver in drivers: df_hourly_copy = df_hourly.copy() df_hourly_copy["driver_id"] = driver df_all_drivers = pd.concat([df_hourly_copy, df_all_drivers]) df_all_drivers.reset_index(drop=True, inplace=True) rows = df_all_drivers["datetime"].count() df_all_drivers["conv_rate"] = np.random.random(size=rows).astype(np.float32) df_all_drivers["acc_rate"] = np.random.random(size=rows).astype(np.float32) df_all_drivers["avg_daily_trips"] = np.random.randint(0, 1000, size=rows).astype( np.int32 ) df_all_drivers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) # Create duplicate rows that should be filtered by created timestamp # TODO: These duplicate rows area indirectly being filtered out by the point in time join already. We need to # inject a bad row at a timestamp where we know it will get joined to the entity dataframe, and then test that # we are actually filtering it with the created timestamp late_row = df_all_drivers.iloc[int(rows / 2)] df_all_drivers = df_all_drivers.append(late_row).append(late_row) return df_all_drivers
[docs]def create_customer_daily_profile_df(customers, start_date, end_date) -> pd.DataFrame: """ Example df generated by this function: | datetime | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created | |------------------+-------------+-----------------+---------------------+---------------------+------------------| | 2021-03-17 19:31 | 1010 | 0.889188 | 0.049057 | 412 | 2021-03-24 19:38 | | 2021-03-18 19:31 | 1010 | 0.979273 | 0.212630 | 639 | 2021-03-24 19:38 | | 2021-03-19 19:31 | 1010 | 0.976549 | 0.176881 | 70 | 2021-03-24 19:38 | | 2021-03-20 19:31 | 1010 | 0.273697 | 0.325012 | 68 | 2021-03-24 19:38 | | 2021-03-21 19:31 | 1010 | 0.438262 | 0.313009 | 192 | 2021-03-24 19:38 | | | ... | ... | ... | ... | | | 2021-03-19 19:31 | 1001 | 0.738860 | 0.857422 | 344 | 2021-03-24 19:38 | | 2021-03-20 19:31 | 1001 | 0.848397 | 0.745989 | 106 | 2021-03-24 19:38 | | 2021-03-21 19:31 | 1001 | 0.301552 | 0.185873 | 812 | 2021-03-24 19:38 | | 2021-03-22 19:31 | 1001 | 0.943030 | 0.561219 | 322 | 2021-03-24 19:38 | | 2021-03-23 19:31 | 1001 | 0.354919 | 0.810093 | 273 | 2021-03-24 19:38 | """ df_daily = pd.DataFrame( { "datetime": [ pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") for dt in pd.date_range( start=start_date, end=end_date, freq="1D", closed="left" ) ] } ) df_all_customers = pd.DataFrame() for customer in customers: df_daily_copy = df_daily.copy() df_daily_copy["customer_id"] = customer df_all_customers = pd.concat([df_daily_copy, df_all_customers]) df_all_customers.reset_index(drop=True, inplace=True) rows = df_all_customers["datetime"].count() df_all_customers["current_balance"] = np.random.random(size=rows).astype(np.float32) df_all_customers["avg_passenger_count"] = np.random.random(size=rows).astype( np.float32 ) df_all_customers["lifetime_trip_count"] = np.random.randint( 0, 1000, size=rows ).astype(np.int32) # TODO: Remove created timestamp in order to test whether its really optional df_all_customers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) return df_all_customers