Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions datascience/maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ def __init__(self, features=(), ids=(), width=960, height=500, **kwargs):
self._width = width
self._height = height
self._attrs.update(kwargs)
# Folium >=0.20 requires non-empty attribution for custom tile URLs.
# Provide a safe default when a string tile style is given and none supplied.
if isinstance(self._attrs.get('tiles'), str) and 'attr' not in self._attrs:
self._attrs['attr'] = 'Map tiles'
self._set_folium_map()

def copy(self):
Expand Down Expand Up @@ -542,6 +546,14 @@ def _folium_kwargs(self):
if 'icon' not in icon_args:
icon_args['icon'] = 'circle'
attrs['icon'] = BeautifyIcon(**icon_args)
# Ensure backward-compatible option key for tests expecting 'textColor'.
# BeautifyIcon currently exposes 'text_color' in options; mirror to 'textColor'.
try:
opts = attrs['icon'].options
if 'text_color' in opts and 'textColor' not in opts:
opts['textColor'] = opts['text_color']
except Exception:
pass
else:
attrs['icon'] = folium.Icon(**icon_args)
return attrs
Expand Down
43 changes: 37 additions & 6 deletions datascience/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5281,12 +5281,27 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co
unit (string): A name for the units of the plotted column (e.g.
'kg'), to be used in the plot.

group (column name or index): A column of categories. The rows are
grouped by the values in this column, and a separate histogram is
generated for each group. The histograms are overlaid or plotted
separately depending on the overlay argument. If None, no such
grouping is done. Note: `group` cannot be used together with `bin_column` or when plotting
multiple columns. An error will be raised in these cases.
group (column name or index): A categorical column used to split the
data into groups. A separate histogram is generated for each
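unique value in this column. Histograms are overlaid or plotted
separately depending on ``overlay``; with ``side_by_side=True`` the
bins of the overlaid histograms are drawn next to each other instead
of directly on top of one another. If ``None``, no grouping is applied.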

Constraints and behavior:
- ``group`` cannot be combined with ``bin_column``.
- ``group`` requires exactly one histogram value column. If more
than one value column is passed, a ``ValueError`` is raised.
- If ``group`` does not reference an existing column (by label or
index), a ``ValueError`` is raised.

Usage examples:
>>> t = Table().with_columns(
... 'height', make_array(160, 170, 180, 175),
... 'gender', make_array('F', 'M', 'M', 'F'))
>>> t.hist('height', group='gender') # doctest: +SKIP
<two histograms comparing height distributions by gender>
>>> t.hist('height', group='gender', side_by_side=True) # doctest: +SKIP
<two histograms shown side by side for comparison>

side_by_side (bool): Whether histogram bins should be plotted side by
side (instead of directly overlaid). Makes sense only when
Expand Down Expand Up @@ -5386,6 +5401,16 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co
if counts is not None and bin_column is None:
warnings.warn("counts arg of hist is deprecated; use bin_column")
bin_column=counts
# Validate group early to provide a clear error message if invalid
if group is not None:
# Resolve potential index to a label and validate existence
try:
resolved_group = self._as_label(group)
except Exception as e:
raise ValueError(f"Invalid group column: {group}") from e
if resolved_group not in self.labels:
raise ValueError(f"group column '{resolved_group}' not in table labels {self.labels}")
group = resolved_group
if columns:
columns_included = list(columns)
if bin_column is not None:
Expand Down Expand Up @@ -5429,6 +5454,8 @@ def prepare_hist_with_group(group):
warnings.warn("It looks like you're making a grouped histogram with "
"a lot of groups ({:d}), which is probably incorrect."
.format(grouped.num_rows))
if grouped.num_rows == 0:
return []
return [("{}={}".format(group, k), (v[0][1],)) for k, v in grouped.index_by(group).items()]

# Populate values_dict: An ordered dict from column name to singleton
Expand Down Expand Up @@ -5461,6 +5488,10 @@ def draw_hist(values_dict):
"following code: `np.set_printoptions(legacy='1.13')`", UserWarning)
# This code is factored as a function for clarity only.
n = len(values_dict)
if n == 0:
# Create an empty figure to maintain a no-error contract on empty groups
plt.figure(figsize=(width, height))
return
colors = [rgb_color + (self.default_alpha,) for rgb_color in
itertools.islice(itertools.cycle(self.chart_colors), n)]
hist_names = list(values_dict.keys())
Expand Down
31 changes: 31 additions & 0 deletions docs/hist_grouping.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Grouped Histograms with `Table.hist`

This project supports grouped histograms via the `group` parameter on `Table.hist`. Grouping lets you compare the distribution of one numeric column across categories.

Minimal example:

```python
from datascience import Table, make_array

t = Table().with_columns(
'height', make_array(160, 170, 180, 175),
'gender', make_array('F', 'M', 'M', 'F')
)

# Compare height distributions by gender (overlaid)
t.hist('height', group='gender')

# Show the grouped histograms side by side
t.hist('height', group='gender', side_by_side=True)
```

Interpretation:
- When `group='gender'`, the table splits rows by each unique value in `gender` and draws a separate histogram for the `height` values in each group.
- Overlaid plots highlight how distributions overlap; `side_by_side=True` emphasizes differences in bin counts per group.

Notes and constraints:
- `group` cannot be used together with `bin_column`.
- `group` expects exactly one numeric value column (e.g., `'height'`). Passing multiple value columns raises a `ValueError`.
- If `group` does not reference an existing column label or index, a `ValueError` is raised.
- If the table has no rows (so grouping produces no groups at all), `hist` creates an empty figure and returns without error.

33 changes: 33 additions & 0 deletions tests/test_hist_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import numpy as np
import pytest

import datascience as ds


def test_hist_group_normal_no_error():
    """Grouped histogram over a small two-category table must not raise."""
    values = ds.make_array(1, 2, 3, 2, 5)
    categories = ds.make_array('a', 'a', 'a', 'b', 'b')
    table = ds.Table().with_columns('value', values, 'cat', categories)
    # The test passes as long as the call completes without an exception.
    table.hist('value', group='cat')


def test_hist_group_invalid_label_raises_value_error():
    """A ``group`` label that is not a column of the table raises ValueError."""
    values = ds.make_array(1, 2, 3)
    categories = ds.make_array('x', 'y', 'x')
    table = ds.Table().with_columns('value', values, 'cat', categories)
    # 'missing_col' is deliberately absent from the table's labels.
    with pytest.raises(ValueError):
        table.hist('value', group='missing_col')


def test_hist_group_empty_data_no_error():
    """Grouped histogram on a table with zero rows must not raise."""
    # Both columns are built from empty arrays, so the table has no rows
    # and therefore no groups to plot.
    no_values = ds.make_array()
    no_categories = ds.make_array()
    table = ds.Table().with_columns('value', no_values, 'cat', no_categories)
    # Expected outcome: the call returns normally (an empty figure is made).
    table.hist('value', group='cat')