Skip to content

Commit 293aadd

Browse files
authored
Merge pull request #27 from ray310/004
004
2 parents b472307 + 39825fc commit 293aadd

File tree

8 files changed

+219
-60
lines changed

8 files changed

+219
-60
lines changed

.pylintrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,5 @@ max-line-length=88
3737
check-quote-consistency=yes
3838

3939
[DESIGN]
40-
max-args=5
40+
max-args=5
41+
max-attributes=10

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,21 @@
22

33
## Unreleased
44
### Added
5+
- functionality to detect time series gaps
6+
7+
____
8+
## 0.1.0 - Unreleased
9+
### Added
10+
- Split reports module
511
- Improved project documentation
12+
13+
____
14+
## 0.0.4 - 2024-07-09
15+
### Added
16+
- Add support for improved display in Jupyter Notebooks [gh-22](https://github.com/ray310/Panda-Helper/issues/22)
17+
- Allow user to select different string formats for profiles [gh-24](https://github.com/ray310/Panda-Helper/issues/24)
18+
- Allow user to specify number of most frequent and least frequent values to display in SeriesProfile [gh-25](https://github.com/ray310/Panda-Helper/issues/25)
19+
620
____
721
## 0.0.3 - 2024-07-06
822
### Added

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2024 Ray310
3+
Copyright (c) 2022-2024 Ray310
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

conda_environment_dev.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ channels:
44
- conda-forge
55
dependencies:
66
- python=3.12
7+
- beautifulsoup4=4.12.3
78
- black=24.4.2
89
- build=0.7.0
9-
- codespell=0.0.0
10+
- codespell
1011
- coverage=7.2.2
1112
- jupyter=1.0.0
13+
- mkdocs
1214
- notebook=7.0.8
1315
- numpy=1.26.4
1416
- pandas=2.2.2
@@ -18,7 +20,6 @@ dependencies:
1820
- pylint=3.2.2
1921
- pytest=7.4.4
2022
- scipy=1.13.1
21-
- sphinx=7.3.7
2223
- twine=4.0.2
2324
- pip:
24-
- tabulate=0.9.0
25+
- tabulate==0.9.0

pyproject.toml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "panda-helper"
7-
version = "0.0.3"
7+
version = "0.0.4"
88
dependencies = [
9-
"pandas>=2.0",
10-
"scipy>=1.11",
11-
"tabulate",
9+
"beautifulsoup4>=4.12.2",
10+
"numpy>=1.26.0",
11+
"pandas>=2.1.1",
12+
"scipy>=1.11.2",
13+
"tabulate==0.9.0",
1214
]
1315
requires-python = ">=3.9"
1416
authors = [

src/pandahelper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
)
1010

1111

12-
__version__ = "0.0.3"
12+
__version__ = "0.0.4"
1313
__all__ = ["DataFrameProfile", "SeriesProfile", "frequency_table"]

src/pandahelper/reports.py

Lines changed: 136 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""Panda-Helper Classes and associated helper functions."""
22

33
from warnings import warn
4+
import bs4
45
import numpy as np
56
import pandas as pd
67
import pandas.api.types as pat
78
import scipy.stats
89
from tabulate import tabulate
910

10-
1111
warn(
1212
"reports module is deprecated and will be removed in a future version."
1313
"'import pandahelper' will provide access to profiles and methods.",
@@ -16,6 +16,32 @@
1616
)
1717

1818

19+
def frequency_table(series):
    """Build a table of value counts and their share of the total.

    Null values are left out because ``Series.value_counts`` drops them by
    default.

    Args:
        series (pd.Series): Series whose values are tallied.

    Returns:
        pd.DataFrame: One row per distinct value, sorted by descending count.
            The index holds each value converted to ``str`` and passed through
            ``_abbreviate_string`` with ``limit=60``. Column "Count" holds the
            number of occurrences and column "% of Total" holds the relative
            frequency formatted as a percentage string with two decimals.

    Raises:
        TypeError: If input is not a pd.Series.
    """
    if not isinstance(series, pd.Series):
        raise TypeError(f"{series}, is not pd.Series")
    tally = series.value_counts()  # nulls are dropped by default
    tally.name = "Count"
    shares = series.value_counts(normalize=True)
    share_text = pd.Series(
        [f"{share:.2%}" for share in shares], index=shares.index, name="% of Total"
    )
    table = pd.concat([tally, share_text], axis=1)
    table.index = [_abbreviate_string(str(label), limit=60) for label in table.index]
    return table.sort_values(by="Count", ascending=False)
43+
44+
1945
def _abbreviate_df(df, first=20, last=5):
2046
"""Return a shortened DataFrame or Series.
2147
@@ -161,32 +187,6 @@ def _order_stats(stats: dict):
161187
return {k: stats[k] for k in key_list}
162188

163189

164-
def frequency_table(series):
165-
"""Return value counts and relative frequency.
166-
167-
Args:
168-
series (pd.Series): Series used to calculate value counts and relative
169-
frequencies.
170-
171-
Returns:
172-
pd.DataFrame: DataFrame containing values as the row index with value
173-
counts and counts as a percentage of total count.
174-
175-
Raises:
176-
TypeError: If input is not a pd.Series.
177-
"""
178-
if not isinstance(series, pd.Series):
179-
raise TypeError(f"{series}, is not pd.Series")
180-
freq = series.value_counts() # excludes nulls
181-
freq.name = "Count"
182-
counts = series.value_counts(normalize=True)
183-
percent = pd.Series([f"{x:.2%}" for x in counts], index=counts.index)
184-
percent.name = "% of Total"
185-
output = pd.concat([freq, percent], axis=1)
186-
output.index = [_abbreviate_string(str(x), limit=60) for x in output.index]
187-
return output
188-
189-
190190
class DataFrameProfile:
191191
"""DataFrame-level data profile.
192192
@@ -199,15 +199,17 @@ class DataFrameProfile:
199199
dtypes (pd.Series): Data types of Series within DataFrame.
200200
num_duplicates (int): Number of duplicated rows.
201201
nulls_per_row (pd.Series): Count of null values per row.
202-
nulls_stats (list): Distribution statistics on nulls per row.
202+
null_stats (list): Distribution statistics on nulls per row.
203203
"""
204204

205-
def __init__(self, df, name=""):
205+
def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"):
206206
"""Initialize DataFrameProfile.
207207
208208
Args:
209209
df (pd.DataFrame): DataFrame to profile.
210210
name (str, optional): Name to assign to profile.
211+
fmt (str, optional): Printed table format. See
212+
https://github.com/astanin/python-tabulate for options.
211213
212214
Raises:
213215
TypeError: If input is not a pd.DataFrame.
@@ -220,23 +222,47 @@ def __init__(self, df, name=""):
220222
self.num_duplicates = sum(df.duplicated(keep="first"))
221223
self.nulls_per_row = df.isna().sum(axis=1)
222224
self.null_stats = distribution_stats(self.nulls_per_row)
225+
self._format = fmt
223226

224-
def __repr__(self):
225-
"""Printable version of profile."""
227+
def __create_tables(self, table_fmt: str):
228+
"""Create DataFrameProfile summary tables.
229+
230+
Args:
231+
table_fmt (str): Tabulate table format name.
232+
233+
Returns:
234+
list(str): List of Tabulate tables.
235+
236+
"""
226237
df_info = [
227238
("DF Shape", self.shape),
228239
("Duplicated Rows", self.num_duplicates),
229240
]
230241
if self.name:
231242
df_info.insert(0, ("DF Name", self.name))
232-
df_table = tabulate(df_info, headers=["DataFrame-Level Info", ""])
233-
dtype_table = tabulate(self.dtypes, headers=["Series Name", "Data Type"])
243+
df_table = tabulate(
244+
df_info, headers=["DataFrame-Level Info", ""], tablefmt=table_fmt
245+
)
246+
dtype_table = tabulate(
247+
self.dtypes, headers=["Series Name", "Data Type"], tablefmt=table_fmt
248+
)
234249
null_table = tabulate(
235-
list(self.null_stats.items()), headers=["Summary of Nulls Per Row", ""]
250+
list(self.null_stats.items()),
251+
headers=["Summary of Nulls Per Row", ""],
252+
tablefmt=table_fmt,
236253
)
237-
output = ["".join([x, "\n\n"]) for x in [df_table, dtype_table, null_table]]
238-
output = "".join(output).strip()
239-
return output + "\n"
254+
return [df_table, dtype_table, null_table]
255+
256+
def __repr__(self):
257+
"""Printable version of profile."""
258+
output = ["".join([x, "\n\n"]) for x in self.__create_tables(self._format)]
259+
return "".join(output).strip() + "\n"
260+
261+
def _repr_html_(self):
    """Return the profile as HTML: three styled tables separated by ``<br>``."""
    info_html, dtype_html, nulls_html = (
        _format_html_table(raw) for raw in self.__create_tables("html")
    )
    # Only the null-statistics table has its value column decimal-aligned.
    nulls_html = _decimal_align_col(nulls_html, 1)
    return "<br>".join([info_html, dtype_html, nulls_html])
240266

241267
def save_report(self, path):
242268
"""Save profile to provided path.
@@ -260,22 +286,33 @@ class SeriesProfile:
260286
count (int): Count of non-null values.
261287
num_unique (int): Number of unique values.
262288
num_nulls (int): Number of null values.
263-
frequency (pd.DataFrame): Table of value counts and relative frequency
264-
as a DataFrame
265-
stats (list): Distribution statistics for numeric Series.
289+
frequency (pd.DataFrame): Frequency table with counts and percentage.
290+
stats (list): Distribution statistics for Series.
266291
"""
267292

268-
def __init__(self, series):
293+
def __init__(
294+
self,
295+
series: pd.Series,
296+
*,
297+
fmt: str = "simple",
298+
freq_most_least: tuple = (20, 5),
299+
):
269300
"""Initialize SeriesProfile.
270301
271302
Args:
272303
series (pd.Series): Series to profile.
304+
fmt (str, optional): Printed table format. See
305+
https://github.com/astanin/python-tabulate for options.
306+
freq_most_least (tuple, optional): Tuple (x, y) of the x most common and
307+
y least common values to display in frequency table.
273308
274309
Raises:
275310
TypeError: If input is not a pd.Series.
276311
"""
277312
if not isinstance(series, pd.Series):
278313
raise TypeError(f"{series}, is not pd.DataFrame")
314+
if freq_most_least[0] < 0 or freq_most_least[1] < 0:
315+
raise ValueError("Tuple values must be >= 0!")
279316
self.name = series.name
280317
self.dtype = series.dtype
281318
self.count = series.count() # counts non-null values
@@ -288,9 +325,19 @@ def __init__(self, series):
288325
or isinstance(self.dtype, pd.CategoricalDtype)
289326
):
290327
self.stats = distribution_stats(series)
328+
self._format = fmt
329+
self._freq_table = freq_most_least
291330

292-
def __repr__(self):
293-
"""Printable version of profile."""
331+
def __create_tables(self, table_fmt: str):
332+
"""Create SeriesProfile summary tables.
333+
334+
Args:
335+
table_fmt (str): Tabulate table format name.
336+
337+
Returns:
338+
list(str): List of Tabulate tables.
339+
340+
"""
294341
series_info = [
295342
("Data Type", self.dtype),
296343
("Count", self.count),
@@ -300,20 +347,39 @@ def __repr__(self):
300347
sname = self.name
301348
if not sname:
302349
sname = "Series"
303-
series_table = tabulate(series_info, headers=[f"{sname} Info", ""])
304-
freq_info = _abbreviate_df(self.frequency, first=20, last=5)
305-
freq_table = tabulate(freq_info, headers=["Value", "Count", "% of total"])
350+
series_table = tabulate(
351+
series_info, headers=[f"{sname} Info", ""], tablefmt=table_fmt
352+
)
353+
freq_info = _abbreviate_df(
354+
self.frequency, first=self._freq_table[0], last=self._freq_table[1]
355+
)
356+
freq_table = tabulate(
357+
freq_info, headers=["Value", "Count", "% of total"], tablefmt=table_fmt
358+
)
306359
stats_table = ""
307360
if self.stats is not None:
308361
stats = self.stats
309362
if pat.is_complex_dtype(
310363
self.dtype
311364
): # tabulate converts complex numbers to real numbers
312365
stats = {k: str(v) for k, v in self.stats.items()}
313-
stats_table = tabulate(list(stats.items()), headers=["Statistic", "Value"])
314-
output = ["".join([x, "\n\n"]) for x in [series_table, freq_table, stats_table]]
315-
output = "".join(output).strip()
316-
return output + "\n"
366+
stats_table = tabulate(
367+
list(stats.items()),
368+
headers=["Statistic", "Value"],
369+
tablefmt=table_fmt,
370+
)
371+
return [series_table, freq_table, stats_table]
372+
373+
def __repr__(self):
374+
"""Printable version of profile."""
375+
output = ["".join([x, "\n\n"]) for x in self.__create_tables(self._format)]
376+
return "".join(output).strip() + "\n"
377+
378+
def _repr_html_(self):
    """Return the profile as HTML: three styled tables separated by ``<br>``."""
    info_html, freq_html, stats_html = (
        _format_html_table(raw) for raw in self.__create_tables("html")
    )
    # Only the statistics table has its value column decimal-aligned.
    stats_html = _decimal_align_col(stats_html, 1)
    return "<br>".join([info_html, freq_html, stats_html])
317383

318384
def save_report(self, path):
319385
"""Save profile to provided path.
@@ -323,3 +389,23 @@ def save_report(self, path):
323389
"""
324390
with open(path, "w+", encoding="utf-8") as fh:
325391
fh.write(str(self))
392+
393+
394+
def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str:
    """Apply an inline font and text-alignment style to every cell of an HTML table.

    Args:
        table (str): HTML table markup, e.g. as prepared by tabulate.
        align (str): Value used for the CSS ``text-align`` property.
        font (str): Preferred font family; ``monospace`` is always appended as
            the fallback.

    Returns:
        str: The markup with a ``style`` attribute set on each ``th``/``td``
            found inside a ``tr``.
    """
    cell_style = f"font-family: {font}, monospace; text-align: {align};"
    parsed = bs4.BeautifulSoup(table, "html.parser")
    for table_row in parsed.find_all("tr"):
        # Rows in thead hold 'th' cells, body rows hold 'td' cells.
        for cell in table_row.find_all(["th", "td"]):
            cell["style"] = cell_style
    return str(parsed)
402+
403+
404+
def _decimal_align_col(table: str, col: int) -> str:
    """Create decimal-aligned numbers in a column of an HTML table.

    Regular spaces in the selected cell of each row are replaced with figure
    spaces (U+2007), presumably so that padding emitted by tabulate keeps its
    width when the HTML is rendered.

    Args:
        table (str): HTML table markup.
        col (int): Index of the ``td`` cell within each row to adjust.

    Returns:
        str: The markup with the selected column adjusted. Rows without ``td``
            cells (such as header rows), rows too short to contain ``col``, and
            cells that do not hold exactly one text node are left unchanged.
    """
    soup = bs4.BeautifulSoup(table, "html.parser")
    for row in soup.find_all("tr"):
        cells = row.find_all("td")  # header rows hold only 'th' cells
        if not -len(cells) <= col < len(cells):
            continue  # no such cell in this row (also covers rows with no 'td')
        text = cells[col].string
        if text is None:
            continue  # empty cell or nested markup: no single string to edit
        cells[col].string = text.replace(" ", "\u2007")  # figure space
    return str(soup)

0 commit comments

Comments
 (0)