Merge pull request #27 from ray310/004

ray310 · web-flow · commit 293aadd1db7f · 2024-07-09T10:38:50.000-05:00
004
diff --git a/.pylintrc b/.pylintrc
@@ -37,4 +37,5 @@ max-line-length=88
 check-quote-consistency=yes
 
 [DESIGN]
-max-args=5
+max-args=5
+max-attributes=10
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,7 +2,21 @@
 
 ## Unreleased
 ### Added
+- Functionality to detect time series gaps
+
+____
+## 0.1.0 - Unreleased
+### Added
+- Split reports module
 - Improved project documentation
+
+____
+## 0.0.4 - 2024-07-09
+### Added
+- Add support for improved display in Jupyter Notebooks [gh-22](https://github.com/ray310/Panda-Helper/issues/22)
+- Allow user to select different string formats for profiles [gh-24](https://github.com/ray310/Panda-Helper/issues/24)
+- Allow user to specify number of most frequent and least frequent values to display in SeriesProfile [gh-25](https://github.com/ray310/Panda-Helper/issues/25)
+
 ____
 ## 0.0.3 - 2024-07-06
 ### Added
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Ray310
+Copyright (c) 2022-2024 Ray310
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/conda_environment_dev.yaml b/conda_environment_dev.yaml
@@ -4,11 +4,13 @@ channels:
     - conda-forge
 dependencies:
     - python=3.12
+    - beautifulsoup4=4.12.3
     - black=24.4.2
     - build=0.7.0
-    - codespell=0.0.0
+    - codespell
     - coverage=7.2.2
     - jupyter=1.0.0
+    - mkdocs
     - notebook=7.0.8
     - numpy=1.26.4
     - pandas=2.2.2
@@ -18,7 +20,6 @@ dependencies:
     - pylint=3.2.2
     - pytest=7.4.4
     - scipy=1.13.1
-    - sphinx=7.3.7
     - twine=4.0.2
     - pip:
-        - tabulate=0.9.0
+        - tabulate==0.9.0
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,11 +4,13 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "panda-helper"
-version = "0.0.3"
+version = "0.0.4"
 dependencies = [
-  "pandas>=2.0",
-  "scipy>=1.11",
-  "tabulate",
+    "beautifulsoup4>=4.12.2",
+    "numpy>=1.26.0",
+    "pandas>=2.1.1",
+    "scipy>=1.11.2",
+    "tabulate==0.9.0",
 ]
 requires-python = ">=3.9"
 authors = [
diff --git a/src/pandahelper/__init__.py b/src/pandahelper/__init__.py
@@ -9,5 +9,5 @@
 )
 
 
-__version__ = "0.0.3"
+__version__ = "0.0.4"
 __all__ = ["DataFrameProfile", "SeriesProfile", "frequency_table"]
diff --git a/src/pandahelper/reports.py b/src/pandahelper/reports.py
@@ -1,13 +1,13 @@
 """Panda-Helper Classes and associated helper functions."""
 
 from warnings import warn
+import bs4
 import numpy as np
 import pandas as pd
 import pandas.api.types as pat
 import scipy.stats
 from tabulate import tabulate
 
-
 warn(
     "reports module is deprecated and will be removed in a future version."
     "'import pandahelper' will provide access to profiles and methods.",
@@ -16,6 +16,32 @@
 )
 
 
+def frequency_table(series):
+    """Return value counts and relative frequency.
+
+    Args:
+        series (pd.Series): Series used to calculate value counts and relative
+            frequencies.
+
+    Returns:
+        pd.DataFrame: DataFrame containing values as the row index with value
+            counts and counts as a percentage of total count.
+
+    Raises:
+        TypeError: If input is not a pd.Series.
+    """
+    if not isinstance(series, pd.Series):
+        raise TypeError(f"{series}, is not pd.Series")
+    freq = series.value_counts()  # excludes nulls
+    freq.name = "Count"
+    counts = series.value_counts(normalize=True)
+    percent = pd.Series([f"{x:.2%}" for x in counts], index=counts.index)
+    percent.name = "% of Total"
+    output = pd.concat([freq, percent], axis=1)
+    output.index = [_abbreviate_string(str(x), limit=60) for x in output.index]
+    return output.sort_values(by="Count", ascending=False)
+
+
 def _abbreviate_df(df, first=20, last=5):
     """Return a shortened DataFrame or Series.
 
@@ -161,32 +187,6 @@ def _order_stats(stats: dict):
     return {k: stats[k] for k in key_list}
 
 
-def frequency_table(series):
-    """Return value counts and relative frequency.
-
-    Args:
-        series (pd.Series): Series used to calculate value counts and relative
-            frequencies.
-
-    Returns:
-        pd.DataFrame: DataFrame containing values as the row index with value
-            counts and counts as a percentage of total count.
-
-    Raises:
-        TypeError: If input is not a pd.Series.
-    """
-    if not isinstance(series, pd.Series):
-        raise TypeError(f"{series}, is not pd.Series")
-    freq = series.value_counts()  # excludes nulls
-    freq.name = "Count"
-    counts = series.value_counts(normalize=True)
-    percent = pd.Series([f"{x:.2%}" for x in counts], index=counts.index)
-    percent.name = "% of Total"
-    output = pd.concat([freq, percent], axis=1)
-    output.index = [_abbreviate_string(str(x), limit=60) for x in output.index]
-    return output
-
-
 class DataFrameProfile:
     """DataFrame-level data profile.
 
@@ -199,15 +199,17 @@ class DataFrameProfile:
         dtypes (pd.Series): Data types of Series within DataFrame.
         num_duplicates (int): Number of duplicated rows.
         nulls_per_row (pd.Series): Count of null values per row.
-        nulls_stats (list): Distribution statistics on nulls per row.
+        null_stats (list): Distribution statistics on nulls per row.
     """
 
-    def __init__(self, df, name=""):
+    def __init__(self, df: pd.DataFrame, *, name: str = "", fmt: str = "simple"):
         """Initialize DataFrameProfile.
 
         Args:
             df (pd.DataFrame): DataFrame to profile.
             name (str, optional): Name to assign to profile.
+            fmt (str, optional): Printed table format. See
+                https://github.com/astanin/python-tabulate for options.
 
         Raises:
             TypeError: If input is not a pd.DataFrame.
@@ -220,23 +222,47 @@ def __init__(self, df, name=""):
         self.num_duplicates = sum(df.duplicated(keep="first"))
         self.nulls_per_row = df.isna().sum(axis=1)
         self.null_stats = distribution_stats(self.nulls_per_row)
+        self._format = fmt
 
-    def __repr__(self):
-        """Printable version of profile."""
+    def __create_tables(self, table_fmt: str):
+        """Create DataFrameProfile summary tables.
+
+        Args:
+            table_fmt (str): Tabulate table format name.
+
+        Returns:
+            list(str): List of Tabulate tables.
+
+        """
         df_info = [
             ("DF Shape", self.shape),
             ("Duplicated Rows", self.num_duplicates),
         ]
         if self.name:
             df_info.insert(0, ("DF Name", self.name))
-        df_table = tabulate(df_info, headers=["DataFrame-Level Info", ""])
-        dtype_table = tabulate(self.dtypes, headers=["Series Name", "Data Type"])
+        df_table = tabulate(
+            df_info, headers=["DataFrame-Level Info", ""], tablefmt=table_fmt
+        )
+        dtype_table = tabulate(
+            self.dtypes, headers=["Series Name", "Data Type"], tablefmt=table_fmt
+        )
         null_table = tabulate(
-            list(self.null_stats.items()), headers=["Summary of Nulls Per Row", ""]
+            list(self.null_stats.items()),
+            headers=["Summary of Nulls Per Row", ""],
+            tablefmt=table_fmt,
         )
-        output = ["".join([x, "\n\n"]) for x in [df_table, dtype_table, null_table]]
-        output = "".join(output).strip()
-        return output + "\n"
+        return [df_table, dtype_table, null_table]
+
+    def __repr__(self):
+        """Printable version of profile."""
+        output = ["".join([x, "\n\n"]) for x in self.__create_tables(self._format)]
+        return "".join(output).strip() + "\n"
+
+    def _repr_html_(self):
+        """HTML representation of profile."""
+        tables = [_format_html_table(t) for t in self.__create_tables("html")]
+        tables[2] = _decimal_align_col(tables[2], 1)
+        return tables[0] + "<br>" + tables[1] + "<br>" + tables[2]
 
     def save_report(self, path):
         """Save profile to provided path.
@@ -260,22 +286,33 @@ class SeriesProfile:
         count (int): Count of non-null values.
         num_unique (int): Number of unique values.
         num_nulls (int): Number of null values.
-        frequency (pd.DataFrame): Table of value counts and relative frequency
-            as a DataFrame
-        stats (list): Distribution statistics for numeric Series.
+        frequency (pd.DataFrame): Frequency table with counts and percentage.
+        stats (list): Distribution statistics for Series.
     """
 
-    def __init__(self, series):
+    def __init__(
+        self,
+        series: pd.Series,
+        *,
+        fmt: str = "simple",
+        freq_most_least: tuple = (20, 5),
+    ):
         """Initialize SeriesProfile.
 
         Args:
             series (pd.Series): Series to profile.
+            fmt (str, optional): Printed table format. See
+                https://github.com/astanin/python-tabulate for options.
+            freq_most_least (tuple, optional): Tuple (x, y) of the x most common and
+                y least common values to display in frequency table.
 
         Raises:
             TypeError: If input is not a pd.Series.
         """
         if not isinstance(series, pd.Series):
             raise TypeError(f"{series}, is not pd.DataFrame")
+        if freq_most_least[0] < 0 or freq_most_least[1] < 0:
+            raise ValueError("Tuple values must be >= 0!")
         self.name = series.name
         self.dtype = series.dtype
         self.count = series.count()  # counts non-null values
@@ -288,9 +325,19 @@ def __init__(self, series):
             or isinstance(self.dtype, pd.CategoricalDtype)
         ):
             self.stats = distribution_stats(series)
+        self._format = fmt
+        self._freq_table = freq_most_least
 
-    def __repr__(self):
-        """Printable version of profile."""
+    def __create_tables(self, table_fmt: str):
+        """Create SeriesProfile summary tables.
+
+        Args:
+            table_fmt (str): Tabulate table format name.
+
+        Returns:
+            list(str): List of Tabulate tables.
+
+        """
         series_info = [
             ("Data Type", self.dtype),
             ("Count", self.count),
@@ -300,20 +347,39 @@ def __repr__(self):
         sname = self.name
         if not sname:
             sname = "Series"
-        series_table = tabulate(series_info, headers=[f"{sname} Info", ""])
-        freq_info = _abbreviate_df(self.frequency, first=20, last=5)
-        freq_table = tabulate(freq_info, headers=["Value", "Count", "% of total"])
+        series_table = tabulate(
+            series_info, headers=[f"{sname} Info", ""], tablefmt=table_fmt
+        )
+        freq_info = _abbreviate_df(
+            self.frequency, first=self._freq_table[0], last=self._freq_table[1]
+        )
+        freq_table = tabulate(
+            freq_info, headers=["Value", "Count", "% of total"], tablefmt=table_fmt
+        )
         stats_table = ""
         if self.stats is not None:
             stats = self.stats
             if pat.is_complex_dtype(
                 self.dtype
             ):  # tabulate converts complex numbers to real numbers
                 stats = {k: str(v) for k, v in self.stats.items()}
-            stats_table = tabulate(list(stats.items()), headers=["Statistic", "Value"])
-        output = ["".join([x, "\n\n"]) for x in [series_table, freq_table, stats_table]]
-        output = "".join(output).strip()
-        return output + "\n"
+            stats_table = tabulate(
+                list(stats.items()),
+                headers=["Statistic", "Value"],
+                tablefmt=table_fmt,
+            )
+        return [series_table, freq_table, stats_table]
+
+    def __repr__(self):
+        """Printable version of profile."""
+        output = ["".join([x, "\n\n"]) for x in self.__create_tables(self._format)]
+        return "".join(output).strip() + "\n"
+
+    def _repr_html_(self):
+        """HTML representation of profile."""
+        tables = [_format_html_table(t) for t in self.__create_tables("html")]
+        tables[2] = _decimal_align_col(tables[2], 1)
+        return tables[0] + "<br>" + tables[1] + "<br>" + tables[2]
 
     def save_report(self, path):
         """Save profile to provided path.
@@ -323,3 +389,23 @@ def save_report(self, path):
         """
         with open(path, "w+", encoding="utf-8") as fh:
             fh.write(str(self))
+
+
+def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str:
+    """Add additional formatting to HTML table prepared by tabulate."""
+    soup = bs4.BeautifulSoup(table, "html.parser")
+    for row in soup.find_all("tr"):
+        tags = row.find_all(["th", "td"])  # row in thead will have 'th'
+        for tag in tags:
+            tag["style"] = f"font-family: {font}, monospace; text-align: {align};"
+    return str(soup)
+
+
+def _decimal_align_col(table: str, col: int):
+    """Create decimal-aligned numbers in column of HTML table."""
+    soup = bs4.BeautifulSoup(table, "html.parser")
+    for row in soup.find_all("tr"):
+        tags = row.find_all("td")
+        if tags:
+            tags[col].string = tags[col].string.replace(" ", "\u2007")  # figure space
+    return str(soup)
diff --git a/tests/test_reports.py b/tests/test_reports.py

Original file line number	Diff line number	Diff line change
`@@ -9,5 +9,5 @@`
`9`	`9`	`)`
`10`	`10`
`11`	`11`
`12`		`-__version__ = "0.0.3"`
	`12`	`+__version__ = "0.0.4"`
`13`	`13`	`__all__ = ["DataFrameProfile", "SeriesProfile", "frequency_table"]`