Skip to content

feat: custom in-memory indexes #2379

Draft
ilan-gold wants to merge 2 commits into main from
ig/custom_index_objects
Draft

feat: custom in-memory indexes #2379
ilan-gold wants to merge 2 commits into main from
ig/custom_index_objects

Conversation

@ilan-gold
Copy link
Copy Markdown
Contributor

@ilan-gold ilan-gold commented Mar 31, 2026

From the proteomics hackathon, we definitely need to define some parameters and work out kinks:

  1. Writability: feat: AnnData.can_write based on AnnData._reduce + iter_outer + refactorings of other relevant functions #2372 + Lenient write #2373 + Handle new zarr dtypes (both custom and datetime) #2238. Concretely, feat: AnnData.can_write based on AnnData._reduce + iter_outer + refactorings of other relevant functions #2372 needs to go in first to prevent people from even trying to write these, i.e., when there is a MultiIndex.
  2. What exactly are we changing? In other words, what does this setting do exactly? I think the two tests are clear about this, but there is some subtle stuff: for example, you used to be able to do `adata.obs_names = non_string_index`, and this evidently worked? This will be essential for documentation.
  3. Do we need any guardrails? No arrow integer types? What sort of filtering can/should we do here?
  4. In the case of no index provided on declaration, should we make new decisions? https://github.com/scverse/pandas-uuid maybe?
  • Release note not necessary because:

@codecov
Copy link
Copy Markdown

codecov bot commented Mar 31, 2026

❌ 3 Tests Failed:

| Tests completed | Failed | Passed | Skipped |
| --------------- | ------ | ------ | ------- |
| 6130            | 3      | 6127   | 1608    |
View the top 3 failed test(s) by shortest run time
tests.test_base::test_create_from_df_with_obs_and_var
Stack Traces | 0.002s run time
def test_create_from_df_with_obs_and_var():
        df = pd.DataFrame(np.ones((3, 2)), index=["a", "b", "c"], columns=["A", "B"])
        obs = pd.DataFrame(np.ones((3, 1)), index=df.index, columns=["C"])
        var = pd.DataFrame(np.ones((2, 1)), index=df.columns, columns=["D"])
>       ad = AnnData(df, obs=obs, var=var)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

tests/test_base.py:155: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
.../anndata/_core/anndata.py:257: in __init__
    self._init_as_actual(
.../anndata/_core/anndata.py:416: in _init_as_actual
    if settings.force_str_index_when_non_numeric
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = SettingsManager(
	remove_unused_categories=True,
	check_uniqueness=True,
	allow_write_nullable_strings=None,
	zarr_wri...ndices_with_min_possible_dtype=False,
	auto_shard_zarr_v3=False,
	copy_on_write_X=False,
	restrict_index_types=True,

)
option = 'force_str_index_when_non_numeric'

    def __getattr__(self, option: str) -> object:
        """
        Gets the option's value.

        Parameters
        ----------
        option
            Option to be got.

        Returns
        -------
        Value of the option.
        """
        if option in self._deprecated_options:
            deprecated = self._deprecated_options[option]
            msg = f"{option!r} will be removed in {deprecated.removal_version}. {deprecated.message}"
            warn(msg, FutureWarning)
        if option in self._config:
            return self._config[option]
        msg = f"{option} not found."
>       raise AttributeError(msg)
E       AttributeError: force_str_index_when_non_numeric not found.

src/anndata/_settings.py:338: AttributeError
tests.test_base::test_matching_int_index
Stack Traces | 0.002s run time
def test_matching_int_index():
>       adata = AnnData(
            pd.DataFrame(dict(a=[0.0, 0.5]), index=[0, 1]), obs=pd.DataFrame(index=[0, 1])
        )

tests/test_base.py:169: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
.../anndata/_core/anndata.py:257: in __init__
    self._init_as_actual(
.../anndata/_core/anndata.py:416: in _init_as_actual
    if settings.force_str_index_when_non_numeric
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = SettingsManager(
	remove_unused_categories=True,
	check_uniqueness=True,
	allow_write_nullable_strings=None,
	zarr_wri...ndices_with_min_possible_dtype=False,
	auto_shard_zarr_v3=False,
	copy_on_write_X=False,
	restrict_index_types=True,

)
option = 'force_str_index_when_non_numeric'

    def __getattr__(self, option: str) -> object:
        """
        Gets the option's value.

        Parameters
        ----------
        option
            Option to be got.

        Returns
        -------
        Value of the option.
        """
        if option in self._deprecated_options:
            deprecated = self._deprecated_options[option]
            msg = f"{option!r} will be removed in {deprecated.removal_version}. {deprecated.message}"
            warn(msg, FutureWarning)
        if option in self._config:
            return self._config[option]
        msg = f"{option} not found."
>       raise AttributeError(msg)
E       AttributeError: force_str_index_when_non_numeric not found.

src/anndata/_settings.py:338: AttributeError
tests.test_readwrite::test_read_excel
Stack Traces | 0.38s run time
def test_read_excel():
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=r"datetime.datetime.utcnow\(\) is deprecated",
                category=DeprecationWarning,
            )
>           adata = ad.io.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

tests/test_readwrite.py:582: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
.../anndata/_io/read.py:86: in read_excel
    return AnnData(X, row, col)
           ^^^^^^^^^^^^^^^^^^^^
.../anndata/_core/anndata.py:257: in __init__
    self._init_as_actual(
.../anndata/_core/anndata.py:467: in _init_as_actual
    self._obs = _gen_dataframe(
../...../_temp/uv-python-dir/cpython-3.14.3-linux-x86_64-gnu/lib/python3.14/functools.py:982: in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

anno = 0    HFA1_001
1    HFA1_002
2    HFA1_003
Name: Cell, dtype: str
index_names = ['obs_names', 'row_names']

    @_gen_dataframe.register(pd.Series)
    @_gen_dataframe.register(pd.Index)
    def _gen_dataframe_1d(
        anno: pd.Series | pd.Index,
        index_names: Iterable[str],
        *,
        source: Literal["X", "shape"],
        attr: Literal["obs", "var"],
        length: int | None = None,
    ):
        msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
>       raise ValueError(msg)
E       ValueError: Cannot convert <class 'pandas.Series'> to obs DataFrame

.../anndata/_core/aligned_df.py:112: ValueError

To view more test analytics, go to the Test Analytics Dashboard
📋 Got 3 mins? Take this short survey to help us improve Test Analytics.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant