Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ jobs:
pip install -e .
python -c "import flaml"
pip install -e .[test]
- name: On Ubuntu python 3.12, pin pandas 2 for backward compat testing
if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
run: |
pip install "pandas>=2.0,<3"
- name: On Ubuntu python 3.11, install pyspark 3.5.1
if: matrix.python-version == '3.11' && matrix.os == 'ubuntu-latest'
run: |
Expand Down
19 changes: 13 additions & 6 deletions flaml/automl/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
from flaml.automl.spark import DataFrame, F, Series, T, pd, ps, psDataFrame, psSeries
from flaml.automl.training_log import training_log_reader

try:
from pandas.api.types import is_datetime64_any_dtype
except ImportError:
is_datetime64_any_dtype = None

try:
from scipy.sparse import issparse, vstack
except ImportError:
Expand Down Expand Up @@ -302,7 +307,7 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
y = y.rename(TS_VALUE_COL)
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name in ("object", "category", "string"):
if X[column].dtype.name in ("object", "category", "string", "str"):
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
X.drop(columns=column, inplace=True)
drop = True
Expand All @@ -318,7 +323,7 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
X.drop(columns=column, inplace=True)
drop = True
else: # datetime or numeric
if X[column].dtype.name == "datetime64[ns]":
if is_datetime64_any_dtype is not None and is_datetime64_any_dtype(X[column]):
tmp_dt = X[column].dt
new_columns_dict = {
f"year_{column}": tmp_dt.year,
Expand Down Expand Up @@ -347,9 +352,11 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
X[cat_columns] = X[cat_columns].astype("category")
if num_columns:
X_num = X[num_columns]
if np.issubdtype(X_num.columns.dtype, np.integer) and (
drop or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1
):
try:
is_int_cols = np.issubdtype(X_num.columns.dtype, np.integer)
except TypeError:
is_int_cols = False
if is_int_cols and (drop or min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1):
X_num.columns = range(X_num.shape[1])
drop = True
else:
Expand Down Expand Up @@ -435,7 +442,7 @@ def transform(self, X: Union[DataFrame, np.array]):
if self._task.is_ts_forecast():
X.insert(0, TS_TIMESTAMP_COL, ds_col)
for column in cat_columns:
if X[column].dtype.name == "object":
if X[column].dtype.name in ("object", "string", "str"):
X[column] = X[column].fillna("__NAN__")
elif X[column].dtype.name == "category":
current_categories = X[column].cat.categories
Expand Down
1 change: 1 addition & 0 deletions flaml/automl/task/generic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def validate_data(
assert X[column].dtype.name in (
"object",
"string",
"str",
), "If the task is an NLP task, X can only contain text columns"
for _, each_cell in X[column].items():
if each_cell is not None:
Expand Down
7 changes: 4 additions & 3 deletions flaml/automl/time_series/ts_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class PD:
pd.DataFrame = None
pd.Series = None
DataFrame = Series = None
is_datetime64_any_dtype = None


# dataclass will remove empty default value even with field(default_factory=lambda: [])
Expand Down Expand Up @@ -272,7 +273,7 @@ def enrich_dataframe(

new_cols = []
for col in df.columns:
if df[col].dtype.name == "datetime64[ns]":
if is_datetime64_any_dtype is not None and is_datetime64_any_dtype(df[col]):
extras = monthly_fourier_features(df[col], fourier_degree)
Comment thread
thinkall marked this conversation as resolved.
extras.columns = [f"{col}_{c}" for c in extras.columns]
extras.index = df.index
Expand Down Expand Up @@ -403,12 +404,12 @@ def fit(self, X: Union[DataFrame, np.array], y):
continue

# Robust datetime detection (covers datetime64[ms/us/ns], tz-aware, etc.)
if is_datetime64_any_dtype(X[column]):
if is_datetime64_any_dtype is not None and is_datetime64_any_dtype(X[column]):
self.datetime_columns.append(column)
continue

# sklearn/utils/validation.py needs int/float values
if X[column].dtype.name in ("object", "category", "string"):
if X[column].dtype.name in ("object", "category", "string", "str"):
if (
Comment thread
thinkall marked this conversation as resolved.
# drop columns where all values are the same
X[column].nunique() == 1
Expand Down
4 changes: 2 additions & 2 deletions test/automl/test_extra_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@ def load_multi_dataset():
df["timeStamp"] = pd.to_datetime(df["timeStamp"])
df = df.set_index("timeStamp")
df = df.resample("D").mean()
df["temp"] = df["temp"].fillna(method="ffill")
df["precip"] = df["precip"].fillna(method="ffill")
df["temp"] = df["temp"].ffill()
df["precip"] = df["precip"].ffill()
df = df[:-2] # last two rows are NaN for 'demand' column so remove them
df = df.reset_index()

Expand Down
6 changes: 3 additions & 3 deletions test/automl/test_forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_numpy_large():

from flaml import AutoML

X_train = pd.date_range("2017-01-01", periods=70000, freq="T")
X_train = pd.date_range("2017-01-01", periods=70000, freq="min")
y_train = pd.DataFrame(np.random.randint(6500, 7500, 70000))
automl = AutoML()
automl.fit(
Expand All @@ -187,8 +187,8 @@ def load_multi_dataset():
df["timeStamp"] = pd.to_datetime(df["timeStamp"])
df = df.set_index("timeStamp")
df = df.resample("D").mean()
df["temp"] = df["temp"].fillna(method="ffill")
df["precip"] = df["precip"].fillna(method="ffill")
df["temp"] = df["temp"].ffill()
df["precip"] = df["precip"].ffill()
df = df[:-2] # last two rows are NaN for 'demand' column so remove them
df = df.reset_index()

Expand Down
2 changes: 1 addition & 1 deletion test/automl/test_max_iter_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


def test_max_iter_1():
date_rng = pd.date_range(start="2024-01-01", periods=100, freq="H")
date_rng = pd.date_range(start="2024-01-01", periods=100, freq="h")
X = pd.DataFrame({"ds": date_rng})
y_train_24h = np.random.rand(len(X)) * 100

Expand Down
Loading