# This module generates dummy data to be used for tests and examples.
import numpy as np
import pandas as pd
from feast.infra.provider import ENTITY_DF_EVENT_TIMESTAMP_COL
def create_orders_df(
    customers, drivers, start_date, end_date, order_count
) -> pd.DataFrame:
    """
    Build a dataframe of randomly generated orders for tests and examples.

    Every order gets a sequential ``order_id`` starting at 100, a driver and a
    customer drawn at random from ``drivers`` / ``customers``, a random 0/1
    ``order_is_success`` flag (int32) and an event timestamp. The timestamps
    are spaced evenly from ``start_date`` to ``end_date`` (both included),
    expressed in UTC and rounded to the millisecond.

    Args:
        customers: Customer ids to sample from.
        drivers: Driver ids to sample from.
        start_date: Timestamp of the first order. A naive value is taken to
            be UTC; a tz-aware value is converted to UTC.
        end_date: Timestamp of the last order (same tz handling as
            ``start_date``).
        order_count: Number of orders (rows) to generate.

    Returns:
        The orders dataframe, sorted by event timestamp, then ``order_id``,
        ``driver_id`` and ``customer_id``.

    Example df generated by this function:
    | order_id | driver_id | customer_id | order_is_success | event_timestamp     |
    +----------+-----------+-------------+------------------+---------------------+
    | 100      | 5004      | 1007        | 0                | 2021-03-10 19:31:15 |
    | 101      | 5003      | 1006        | 0                | 2021-03-11 22:02:50 |
    | 102      | 5010      | 1005        | 0                | 2021-03-13 00:34:24 |
    | 103      | 5010      | 1001        | 1                | 2021-03-14 03:05:59 |
    """
    event_times = pd.date_range(start=start_date, end=end_date, periods=order_count)
    # Normalise to UTC on the whole index: naive ranges are labelled as UTC,
    # aware ranges are converted. (Building pd.Timestamp(dt, tz="UTC") per
    # element raises when dt already carries a timezone.)
    if event_times.tz is None:
        event_times = event_times.tz_localize("UTC")
    else:
        event_times = event_times.tz_convert("UTC")

    df = pd.DataFrame()
    df["order_id"] = list(range(100, 100 + order_count))
    df["driver_id"] = np.random.choice(drivers, order_count)
    df["customer_id"] = np.random.choice(customers, order_count)
    df["order_is_success"] = np.random.randint(0, 2, size=order_count).astype(np.int32)
    df[ENTITY_DF_EVENT_TIMESTAMP_COL] = event_times.round("ms")

    return df.sort_values(
        by=[ENTITY_DF_EVENT_TIMESTAMP_COL, "order_id", "driver_id", "customer_id"],
    )
def create_driver_hourly_stats_df(drivers, start_date, end_date) -> pd.DataFrame:
    """
    Build a dataframe of random hourly statistics for every driver.

    One row is generated per driver for each hour in the half-open interval
    ``[start_date, end_date)``. Timestamps are expressed in UTC and rounded to
    the millisecond. Two extra copies of the middle row are appended at the
    end so that callers have duplicate rows to filter out.

    Args:
        drivers: Driver ids to generate rows for. Drivers appear in the
            output in reverse order of this iterable.
        start_date: First hourly timestamp. A naive value is taken to be UTC;
            a tz-aware value is converted to UTC.
        end_date: Exclusive upper bound of the hourly timestamps (same tz
            handling as ``start_date``).

    Returns:
        DataFrame with columns ``datetime``, ``driver_id``, ``conv_rate`` and
        ``acc_rate`` (float32 in [0, 1)), ``avg_daily_trips`` (int32 in
        [0, 1000)) and ``created`` (naive local time of generation, rounded
        to the millisecond). The frame is empty when there are no drivers or
        no hours in the interval.

    Example df generated by this function:
    | datetime         | driver_id | conv_rate | acc_rate | avg_daily_trips | created          |
    |------------------+-----------+-----------+----------+-----------------+------------------|
    | 2021-03-17 19:31 | 5010      | 0.229297  | 0.685843 | 861             | 2021-03-24 19:34 |
    | 2021-03-17 20:31 | 5010      | 0.781655  | 0.861280 | 769             | 2021-03-24 19:34 |
    | 2021-03-17 21:31 | 5010      | 0.150333  | 0.525581 | 778             | 2021-03-24 19:34 |
    | 2021-03-17 22:31 | 5010      | 0.951701  | 0.228883 | 570             | 2021-03-24 19:34 |
    | 2021-03-17 23:31 | 5010      | 0.819598  | 0.262503 | 473             | 2021-03-24 19:34 |
    |                  | ...       | ...       | ...      | ...             |                  |
    | 2021-03-24 16:31 | 5001      | 0.061585  | 0.658140 | 477             | 2021-03-24 19:34 |
    | 2021-03-24 17:31 | 5001      | 0.088949  | 0.303897 | 618             | 2021-03-24 19:34 |
    | 2021-03-24 18:31 | 5001      | 0.096652  | 0.747421 | 480             | 2021-03-24 19:34 |
    | 2021-03-17 19:31 | 5005      | 0.142936  | 0.707596 | 466             | 2021-03-24 19:34 |
    | 2021-03-17 19:31 | 5005      | 0.142936  | 0.707596 | 466             | 2021-03-24 19:34 |
    """
    # A Timedelta frequency avoids the "H" alias, which newer pandas deprecates.
    hours = pd.date_range(start=start_date, end=end_date, freq=pd.Timedelta(hours=1))
    # Left-closed interval: drop the end point if the range lands exactly on
    # it. Filtering works on every pandas version, unlike the `closed=` /
    # `inclusive=` keywords.
    hours = hours[hours < pd.Timestamp(end_date)]
    # Naive ranges are labelled as UTC, aware ranges are converted to UTC.
    if hours.tz is None:
        hours = hours.tz_localize("UTC")
    else:
        hours = hours.tz_convert("UTC")
    df_hourly = pd.DataFrame({"datetime": hours.round("ms")})

    per_driver = [df_hourly.assign(driver_id=driver) for driver in drivers]
    if per_driver:
        # Concatenate once, last driver first, which is the row order the
        # previous prepend-in-a-loop implementation produced.
        df_all_drivers = pd.concat(per_driver[::-1], ignore_index=True)
    else:
        df_all_drivers = pd.DataFrame(columns=["datetime", "driver_id"])

    rows = len(df_all_drivers)
    df_all_drivers["conv_rate"] = np.random.random(size=rows).astype(np.float32)
    df_all_drivers["acc_rate"] = np.random.random(size=rows).astype(np.float32)
    df_all_drivers["avg_daily_trips"] = np.random.randint(0, 1000, size=rows).astype(
        np.int32
    )
    df_all_drivers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))

    # Create duplicate rows that should be filtered by created timestamp
    # TODO: These duplicate rows are indirectly being filtered out by the point in time join already. We need to
    #  inject a bad row at a timestamp where we know it will get joined to the entity dataframe, and then test that
    #  we are actually filtering it with the created timestamp
    if rows:
        # Select the row as a one-row DataFrame (not a Series) so the column
        # dtypes survive the concat; the duplicates keep the original index
        # label of the middle row.
        late_row = df_all_drivers.iloc[[rows // 2]]
        df_all_drivers = pd.concat([df_all_drivers, late_row, late_row])
    return df_all_drivers
def create_customer_daily_profile_df(customers, start_date, end_date) -> pd.DataFrame:
    """
    Build a dataframe of random daily profile values for every customer.

    One row is generated per customer for each day in the half-open interval
    ``[start_date, end_date)``. Timestamps are expressed in UTC and rounded to
    the millisecond.

    Args:
        customers: Customer ids to generate rows for. Customers appear in the
            output in reverse order of this iterable.
        start_date: First daily timestamp. A naive value is taken to be UTC;
            a tz-aware value is converted to UTC.
        end_date: Exclusive upper bound of the daily timestamps (same tz
            handling as ``start_date``).

    Returns:
        DataFrame with columns ``datetime``, ``customer_id``,
        ``current_balance`` and ``avg_passenger_count`` (float32 in [0, 1)),
        ``lifetime_trip_count`` (int32 in [0, 1000)) and ``created`` (naive
        local time of generation, rounded to the millisecond). The frame is
        empty when there are no customers or no days in the interval.

    Example df generated by this function:
    | datetime         | customer_id | current_balance | avg_passenger_count | lifetime_trip_count | created          |
    |------------------+-------------+-----------------+---------------------+---------------------+------------------|
    | 2021-03-17 19:31 | 1010        | 0.889188        | 0.049057            | 412                 | 2021-03-24 19:38 |
    | 2021-03-18 19:31 | 1010        | 0.979273        | 0.212630            | 639                 | 2021-03-24 19:38 |
    | 2021-03-19 19:31 | 1010        | 0.976549        | 0.176881            | 70                  | 2021-03-24 19:38 |
    | 2021-03-20 19:31 | 1010        | 0.273697        | 0.325012            | 68                  | 2021-03-24 19:38 |
    | 2021-03-21 19:31 | 1010        | 0.438262        | 0.313009            | 192                 | 2021-03-24 19:38 |
    |                  | ...         | ...             | ...                 | ...                 |                  |
    | 2021-03-19 19:31 | 1001        | 0.738860        | 0.857422            | 344                 | 2021-03-24 19:38 |
    | 2021-03-20 19:31 | 1001        | 0.848397        | 0.745989            | 106                 | 2021-03-24 19:38 |
    | 2021-03-21 19:31 | 1001        | 0.301552        | 0.185873            | 812                 | 2021-03-24 19:38 |
    | 2021-03-22 19:31 | 1001        | 0.943030        | 0.561219            | 322                 | 2021-03-24 19:38 |
    | 2021-03-23 19:31 | 1001        | 0.354919        | 0.810093            | 273                 | 2021-03-24 19:38 |
    """
    days = pd.date_range(start=start_date, end=end_date, freq="1D")
    # Left-closed interval: drop the end point if the range lands exactly on
    # it. Filtering works on every pandas version, unlike the `closed=` /
    # `inclusive=` keywords.
    days = days[days < pd.Timestamp(end_date)]
    # Naive ranges are labelled as UTC, aware ranges are converted to UTC.
    if days.tz is None:
        days = days.tz_localize("UTC")
    else:
        days = days.tz_convert("UTC")
    df_daily = pd.DataFrame({"datetime": days.round("ms")})

    per_customer = [df_daily.assign(customer_id=customer) for customer in customers]
    if per_customer:
        # Concatenate once, last customer first, which is the row order the
        # previous prepend-in-a-loop implementation produced.
        df_all_customers = pd.concat(per_customer[::-1], ignore_index=True)
    else:
        df_all_customers = pd.DataFrame(columns=["datetime", "customer_id"])

    rows = len(df_all_customers)
    df_all_customers["current_balance"] = np.random.random(size=rows).astype(np.float32)
    df_all_customers["avg_passenger_count"] = np.random.random(size=rows).astype(
        np.float32
    )
    df_all_customers["lifetime_trip_count"] = np.random.randint(
        0, 1000, size=rows
    ).astype(np.int32)

    # TODO: Remove created timestamp in order to test whether its really optional
    df_all_customers["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))
    return df_all_customers