Skip to content

Commit d0b53be

Browse files
authored
Merge pull request #28 from openscm/26-multi-update
Add multi-level sources to update_levels_from_other
2 parents e419123 + 661b097 commit d0b53be

File tree

4 files changed

+307
-24
lines changed

4 files changed

+307
-24
lines changed

changelog/28.improvement.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[pandas_openscm.index_manipulation.update_levels_from_other][] now supports updating levels based on multiple other levels from the index at once (see the docstring for examples).
2+
This update also propagates to [pandas_openscm.index_manipulation.update_index_levels_from_other_func][] and [pandas_openscm.accessors.dataframe.PandasDataFrameOpenSCMAccessor.update_index_levels_from_other][].

src/pandas_openscm/accessors/dataframe.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -838,7 +838,17 @@ def update_index_levels(
838838
def update_index_levels_from_other(
839839
self,
840840
update_sources: dict[
841-
Any, tuple[Any, Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any]]
841+
Any,
842+
tuple[
843+
Any,
844+
Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any],
845+
]
846+
| tuple[
847+
tuple[Any, ...],
848+
Callable[[tuple[Any, ...]], Any]
849+
| dict[tuple[Any, ...], Any]
850+
| pd.Series[Any],
851+
],
842852
],
843853
copy: bool = True,
844854
remove_unused_levels: bool = True,
@@ -849,18 +859,29 @@ def update_index_levels_from_other(
849859
Parameters
850860
----------
851861
update_sources
852-
Updates to apply to `df`'s index
862+
Updates to apply to the data's index
853863
854864
Each key is the level to which the updates will be applied
855865
(or the level that will be created if it doesn't already exist).
856866
857-
Each value is a tuple of which the first element
867+
There are two options for the values.
868+
869+
The first is used when only one level is used to update the 'target level'.
870+
In this case, each value is a tuple of which the first element
858871
is the level to use to generate the values (the 'source level')
859872
and the second is a mapper of the form used by
860873
[pd.Index.map][pandas.Index.map]
861874
which will be applied to the source level
862875
to update/create the level of interest.
863876
877+
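The second is the general form, which also allows several levels to be used to update the 'target level'. In this case, each value is a tuple of which the first element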
878+
is the level or levels (if a tuple)
879+
to use to generate the values (the 'source level')
880+
and the second is a mapper of the form used by
881+
[pd.Index.map][pandas.Index.map]
882+
which will be applied to the source level
883+
to update/create the level of interest.
884+
864885
copy
865886
Should the [pd.DataFrame][pandas.DataFrame] be copied before returning?
866887

src/pandas_openscm/index_manipulation.py

Lines changed: 155 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,53 @@ def create_new_level_and_codes_by_mapping(
405405
return new_level, new_codes
406406

407407

408+
def create_new_level_and_codes_by_mapping_multiple(
    ini: pd.MultiIndex,
    levels_to_create_from: tuple[str, ...],
    mapper: Callable[[tuple[Any, ...]], Any]
    | dict[tuple[Any, ...], Any]
    | pd.Series[Any],
) -> tuple[pd.Index[Any], npt.NDArray[np.integer[Any]]]:
    """
    Create a new level and associated codes by mapping existing levels

    This is a thin function intended for internal use
    to handle some slightly tricky logic.

    Parameters
    ----------
    ini
        Input index

    levels_to_create_from
        Levels to create the new level from

        All levels of `ini` that are not listed here are dropped
        before `mapper` is applied.
        The remaining levels keep the order they have in `ini`,
        not the order in which they are given here.

    mapper
        Function to use to map existing levels to new levels

        This is passed to the `map` method of the reduced index,
        so it is applied to the combined values
        of the levels in `levels_to_create_from`.

    Returns
    -------
    new_level :
        New level

        The unique mapped values, in order of first appearance.

    new_codes :
        New codes

        For each row of `ini`, the position in `new_level`
        of that row's mapped value.
    """
    # Keep only the levels we map from.
    # Building the list of levels to drop by hand
    # preserves the order of `ini.names`.
    levels_to_keep = set(levels_to_create_from)
    levels_to_drop = [name for name in ini.names if name not in levels_to_keep]

    # You could probably do some optimisation here
    # that checks for unique combinations of codes
    # for the levels we're using,
    # then only applies the mapping to those unique combos
    # to reduce the number of evaluations of mapper.
    # That feels tricky to get right, so just doing the brute force way for now.
    dup_level = ini.droplevel(levels_to_drop).map(mapper)

    # Brute force: get codes from new levels
    new_level = dup_level.unique()
    new_codes = new_level.get_indexer(dup_level)

    return new_level, new_codes
453+
454+
408455
def update_index_levels_func(
409456
df: pd.DataFrame,
410457
updates: Mapping[Any, Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any]],
@@ -564,7 +611,17 @@ def update_levels(
564611
def update_index_levels_from_other_func(
565612
df: pd.DataFrame,
566613
update_sources: dict[
567-
Any, tuple[Any, Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any]]
614+
Any,
615+
tuple[
616+
Any,
617+
Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any],
618+
]
619+
| tuple[
620+
tuple[Any, ...],
621+
Callable[[tuple[Any, ...]], Any]
622+
| dict[tuple[Any, ...], Any]
623+
| pd.Series[Any],
624+
],
568625
],
569626
copy: bool = True,
570627
remove_unused_levels: bool = True,
@@ -586,13 +643,24 @@ def update_index_levels_from_other_func(
586643
Each key is the level to which the updates will be applied
587644
(or the level that will be created if it doesn't already exist).
588645
589-
Each value is a tuple of which the first element
646+
There are two options for the values.
647+
648+
The first is used when only one level is used to update the 'target level'.
649+
In this case, each value is a tuple of which the first element
590650
is the level to use to generate the values (the 'source level')
591651
and the second is a mapper of the form used by
592652
[pd.Index.map][pandas.Index.map]
593653
which will be applied to the source level
594654
to update/create the level of interest.
595655
656+
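The second is the general form, which also allows several levels to be used to update the 'target level'. In this case, each value is a tuple of which the first element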
657+
is the level or levels (if a tuple)
658+
to use to generate the values (the 'source level')
659+
and the second is a mapper of the form used by
660+
[pd.Index.map][pandas.Index.map]
661+
which will be applied to the source level
662+
to update/create the level of interest.
663+
596664
copy
597665
Should `df` be copied before returning?
598666
@@ -629,7 +697,17 @@ def update_index_levels_from_other_func(
629697
def update_levels_from_other(
630698
ini: pd.MultiIndex,
631699
update_sources: dict[
632-
Any, tuple[Any, Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any]]
700+
Any,
701+
tuple[
702+
Any,
703+
Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any],
704+
]
705+
| tuple[
706+
tuple[Any, ...],
707+
Callable[[tuple[Any, ...]], Any]
708+
| dict[tuple[Any, ...], Any]
709+
| pd.Series[Any],
710+
],
633711
],
634712
remove_unused_levels: bool = True,
635713
) -> pd.MultiIndex:
@@ -650,13 +728,24 @@ def update_levels_from_other(
650728
Each key is the level to which the updates will be applied
651729
(or the level that will be created if it doesn't already exist).
652730
653-
Each value is a tuple of which the first element
731+
There are two options for the values.
732+
733+
The first is used when only one level is used to update the 'target level'.
734+
In this case, each value is a tuple of which the first element
654735
is the level to use to generate the values (the 'source level')
655736
and the second is a mapper of the form used by
656737
[pd.Index.map][pandas.Index.map]
657738
which will be applied to the source level
658739
to update/create the level of interest.
659740
741+
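The second is the general form, which also allows several levels to be used to update the 'target level'. In this case, each value is a tuple of which the first element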
742+
is the level or levels (if a tuple)
743+
to use to generate the values (the 'source level')
744+
and the second is a mapper of the form used by
745+
[pd.Index.map][pandas.Index.map]
746+
which will be applied to the source level
747+
to update/create the level of interest.
748+
660749
remove_unused_levels
661750
Call `ini.remove_unused_levels` before updating the levels
662751
@@ -718,6 +807,19 @@ def update_levels_from_other(
718807
('sa', 'model sa', 'v2', 'km')],
719808
names=['scenario', 'model', 'variable', 'unit'])
720809
>>>
810+
>>> # Create a new level based on multiple existing levels
811+
>>> update_levels_from_other(
812+
... start,
813+
... {
814+
... "model || scenario": (("model", "scenario"), lambda x: " || ".join(x)),
815+
... },
816+
... )
817+
MultiIndex([('sa', 'ma', 'v1', 'kg', 'sa || ma'),
818+
('sb', 'ma', 'v2', 'm', 'sb || ma'),
819+
('sa', 'mb', 'v1', 'kg', 'sa || mb'),
820+
('sa', 'mb', 'v2', 'm', 'sa || mb')],
821+
names=['scenario', 'model', 'variable', 'unit', 'model || scenario'])
822+
>>>
721823
>>> # Both at the same time
722824
>>> update_levels_from_other(
723825
... start,
@@ -731,7 +833,28 @@ def update_levels_from_other(
731833
('sa', 'mb', 'v1', nan, 'Sa'),
732834
('sa', 'mb', 'v2', nan, 'Sa')],
733835
names=['scenario', 'model', 'variable', 'unit', 'title'])
734-
"""
836+
>>>
837+
>>> # Setting with a range of different methods
838+
>>> update_levels_from_other(
839+
... start,
840+
... {
841+
... # callable
842+
... "y-label": (("variable", "unit"), lambda x: f"{x[0]} ({x[1]})"),
843+
... # dict
844+
... "title": ("scenario", {"sa": "Scenario A", "sb": "Delta"}),
845+
... # pd.Series
846+
... "Source": (
847+
... "model",
848+
... pd.Series(["Internal", "External"], index=["ma", "mb"]),
849+
... ),
850+
... },
851+
... )
852+
MultiIndex([('sa', 'ma', 'v1', 'kg', 'v1 (kg)', 'Scenario A', 'Internal'),
853+
('sb', 'ma', 'v2', 'm', 'v2 (m)', 'Delta', 'Internal'),
854+
('sa', 'mb', 'v1', 'kg', 'v1 (kg)', 'Scenario A', 'External'),
855+
('sa', 'mb', 'v2', 'm', 'v2 (m)', 'Scenario A', 'External')],
856+
names=['scenario', 'model', 'variable', 'unit', 'y-label', 'title', 'Source'])
857+
""" # noqa: E501
735858
if remove_unused_levels:
736859
ini = ini.remove_unused_levels() # type: ignore
737860

@@ -740,17 +863,35 @@ def update_levels_from_other(
740863
names: list[str] = list(ini.names)
741864

742865
for level, (source, updater) in update_sources.items():
743-
if source not in ini.names:
744-
msg = (
745-
f"{source} is not available in the index. Available levels: {ini.names}"
866+
if isinstance(source, tuple):
867+
missing_levels = set(source) - set(ini.names)
868+
if missing_levels:
869+
conj = "is" if len(missing_levels) == 1 else "are"
870+
msg = (
871+
f"{sorted(missing_levels)} {conj} not available in the index. "
872+
f"Available levels: {ini.names}"
873+
)
874+
raise KeyError(msg)
875+
876+
new_level, new_codes = create_new_level_and_codes_by_mapping_multiple(
877+
ini=ini,
878+
levels_to_create_from=source,
879+
mapper=updater,
746880
)
747-
raise KeyError(msg)
748881

749-
new_level, new_codes = create_new_level_and_codes_by_mapping(
750-
ini=ini,
751-
level_to_create_from=source,
752-
mapper=updater,
753-
)
882+
else:
883+
if source not in ini.names:
884+
msg = (
885+
f"{source} is not available in the index. "
886+
f"Available levels: {ini.names}"
887+
)
888+
raise KeyError(msg)
889+
890+
new_level, new_codes = create_new_level_and_codes_by_mapping(
891+
ini=ini,
892+
level_to_create_from=source,
893+
mapper=updater,
894+
)
754895

755896
if level in ini.names:
756897
level_idx = ini.names.index(level)

0 commit comments

Comments
 (0)