11"""Panda-Helper Classes and associated helper functions."""
22
33from warnings import warn
4+ import bs4
45import numpy as np
56import pandas as pd
67import pandas .api .types as pat
78import scipy .stats
89from tabulate import tabulate
910
10-
# Module-level deprecation notice, emitted once when this module is imported.
warn(
    # The trailing space keeps the two implicitly concatenated literals from
    # fusing into "version.'import pandahelper'".
    "reports module is deprecated and will be removed in a future version. "
    "'import pandahelper' will provide access to profiles and methods.",
)
1717
1818
def frequency_table(series):
    """Return value counts and relative frequency.

    Args:
        series (pd.Series): Series used to calculate value counts and relative
            frequencies.

    Returns:
        pd.DataFrame: DataFrame containing values as the row index with value
            counts and counts as a percentage of total count, sorted by count
            in descending order.

    Raises:
        TypeError: If input is not a pd.Series.
    """
    if not isinstance(series, pd.Series):
        raise TypeError(f"{series}, is not pd.Series")
    freq = series.value_counts()  # excludes nulls
    freq.name = "Count"
    # Derive the shares from the counts already in hand rather than running a
    # second value_counts(normalize=True) pass over the data.
    shares = freq / freq.sum()
    percent = pd.Series([f"{x:.2%}" for x in shares], index=freq.index)
    percent.name = "% of Total"
    output = pd.concat([freq, percent], axis=1)
    # Index labels are stringified and passed through _abbreviate_string
    # (limit=60) before the table is returned.
    output.index = [_abbreviate_string(str(x), limit=60) for x in output.index]
    return output.sort_values(by="Count", ascending=False)
43+
44+
1945def _abbreviate_df (df , first = 20 , last = 5 ):
2046 """Return a shortened DataFrame or Series.
2147
@@ -161,32 +187,6 @@ def _order_stats(stats: dict):
161187 return {k : stats [k ] for k in key_list }
162188
163189
164- def frequency_table (series ):
165- """Return value counts and relative frequency.
166-
167- Args:
168- series (pd.Series): Series used to calculate value counts and relative
169- frequencies.
170-
171- Returns:
172- pd.DataFrame: DataFrame containing values as the row index with value
173- counts and counts as a percentage of total count.
174-
175- Raises:
176- TypeError: If input is not a pd.Series.
177- """
178- if not isinstance (series , pd .Series ):
179- raise TypeError (f"{ series } , is not pd.Series" )
180- freq = series .value_counts () # excludes nulls
181- freq .name = "Count"
182- counts = series .value_counts (normalize = True )
183- percent = pd .Series ([f"{ x :.2%} " for x in counts ], index = counts .index )
184- percent .name = "% of Total"
185- output = pd .concat ([freq , percent ], axis = 1 )
186- output .index = [_abbreviate_string (str (x ), limit = 60 ) for x in output .index ]
187- return output
188-
189-
190190class DataFrameProfile :
191191 """DataFrame-level data profile.
192192
@@ -199,15 +199,17 @@ class DataFrameProfile:
199199 dtypes (pd.Series): Data types of Series within DataFrame.
200200 num_duplicates (int): Number of duplicated rows.
201201 nulls_per_row (pd.Series): Count of null values per row.
202- nulls_stats (list): Distribution statistics on nulls per row.
202+ null_stats (list): Distribution statistics on nulls per row.
203203 """
204204
205- def __init__ (self , df , name = " " ):
205+ def __init__ (self , df : pd . DataFrame , * , name : str = "" , fmt : str = "simple " ):
206206 """Initialize DataFrameProfile.
207207
208208 Args:
209209 df (pd.DataFrame): DataFrame to profile.
210210 name (str, optional): Name to assign to profile.
211+ fmt (str, optional): Printed table format. See
212+ https://github.com/astanin/python-tabulate for options.
211213
212214 Raises:
213215 TypeError: If input is not a pd.DataFrame.
@@ -220,23 +222,47 @@ def __init__(self, df, name=""):
220222 self .num_duplicates = sum (df .duplicated (keep = "first" ))
221223 self .nulls_per_row = df .isna ().sum (axis = 1 )
222224 self .null_stats = distribution_stats (self .nulls_per_row )
225+ self ._format = fmt
223226
224- def __repr__ (self ):
225- """Printable version of profile."""
227+ def __create_tables (self , table_fmt : str ):
228+ """Create DataFrameProfile summary tables.
229+
230+ Args:
231+ table_fmt (str): Tabulate table format name.
232+
233+ Returns:
234+ list(str): List of Tabulate tables.
235+
236+ """
226237 df_info = [
227238 ("DF Shape" , self .shape ),
228239 ("Duplicated Rows" , self .num_duplicates ),
229240 ]
230241 if self .name :
231242 df_info .insert (0 , ("DF Name" , self .name ))
232- df_table = tabulate (df_info , headers = ["DataFrame-Level Info" , "" ])
233- dtype_table = tabulate (self .dtypes , headers = ["Series Name" , "Data Type" ])
243+ df_table = tabulate (
244+ df_info , headers = ["DataFrame-Level Info" , "" ], tablefmt = table_fmt
245+ )
246+ dtype_table = tabulate (
247+ self .dtypes , headers = ["Series Name" , "Data Type" ], tablefmt = table_fmt
248+ )
234249 null_table = tabulate (
235- list (self .null_stats .items ()), headers = ["Summary of Nulls Per Row" , "" ]
250+ list (self .null_stats .items ()),
251+ headers = ["Summary of Nulls Per Row" , "" ],
252+ tablefmt = table_fmt ,
236253 )
237- output = ["" .join ([x , "\n \n " ]) for x in [df_table , dtype_table , null_table ]]
238- output = "" .join (output ).strip ()
239- return output + "\n "
254+ return [df_table , dtype_table , null_table ]
255+
256+ def __repr__ (self ):
257+ """Printable version of profile."""
258+ output = ["" .join ([x , "\n \n " ]) for x in self .__create_tables (self ._format )]
259+ return "" .join (output ).strip () + "\n "
260+
261+ def _repr_html_ (self ):
262+ """HTML representation of profile."""
263+ tables = [_format_html_table (t ) for t in self .__create_tables ("html" )]
264+ tables [2 ] = _decimal_align_col (tables [2 ], 1 )
265+ return tables [0 ] + "<br>" + tables [1 ] + "<br>" + tables [2 ]
240266
241267 def save_report (self , path ):
242268 """Save profile to provided path.
@@ -260,22 +286,33 @@ class SeriesProfile:
260286 count (int): Count of non-null values.
261287 num_unique (int): Number of unique values.
262288 num_nulls (int): Number of null values.
263- frequency (pd.DataFrame): Table of value counts and relative frequency
264- as a DataFrame
265- stats (list): Distribution statistics for numeric Series.
289+ frequency (pd.DataFrame): Frequency table with counts and percentage.
290+ stats (list): Distribution statistics for Series.
266291 """
267292
268- def __init__ (self , series ):
293+ def __init__ (
294+ self ,
295+ series : pd .Series ,
296+ * ,
297+ fmt : str = "simple" ,
298+ freq_most_least : tuple = (20 , 5 ),
299+ ):
269300 """Initialize SeriesProfile.
270301
271302 Args:
272303 series (pd.Series): Series to profile.
304+ fmt (str, optional): Printed table format. See
305+ https://github.com/astanin/python-tabulate for options.
306+ freq_most_least (tuple, optional): Tuple (x, y) of the x most common and
307+ y least common values to display in frequency table.
273308
274309 Raises:
275310 TypeError: If input is not a pd.Series.
276311 """
277312 if not isinstance (series , pd .Series ):
278313 raise TypeError (f"{ series } , is not pd.DataFrame" )
314+ if freq_most_least [0 ] < 0 or freq_most_least [1 ] < 0 :
315+ raise ValueError ("Tuple values must be >= 0!" )
279316 self .name = series .name
280317 self .dtype = series .dtype
281318 self .count = series .count () # counts non-null values
@@ -288,9 +325,19 @@ def __init__(self, series):
288325 or isinstance (self .dtype , pd .CategoricalDtype )
289326 ):
290327 self .stats = distribution_stats (series )
328+ self ._format = fmt
329+ self ._freq_table = freq_most_least
291330
292- def __repr__ (self ):
293- """Printable version of profile."""
331+ def __create_tables (self , table_fmt : str ):
332+ """Create SeriesProfile summary tables.
333+
334+ Args:
335+ table_fmt (str): Tabulate table format name.
336+
337+ Returns:
338+ list(str): List of Tabulate tables.
339+
340+ """
294341 series_info = [
295342 ("Data Type" , self .dtype ),
296343 ("Count" , self .count ),
@@ -300,20 +347,39 @@ def __repr__(self):
300347 sname = self .name
301348 if not sname :
302349 sname = "Series"
303- series_table = tabulate (series_info , headers = [f"{ sname } Info" , "" ])
304- freq_info = _abbreviate_df (self .frequency , first = 20 , last = 5 )
305- freq_table = tabulate (freq_info , headers = ["Value" , "Count" , "% of total" ])
350+ series_table = tabulate (
351+ series_info , headers = [f"{ sname } Info" , "" ], tablefmt = table_fmt
352+ )
353+ freq_info = _abbreviate_df (
354+ self .frequency , first = self ._freq_table [0 ], last = self ._freq_table [1 ]
355+ )
356+ freq_table = tabulate (
357+ freq_info , headers = ["Value" , "Count" , "% of total" ], tablefmt = table_fmt
358+ )
306359 stats_table = ""
307360 if self .stats is not None :
308361 stats = self .stats
309362 if pat .is_complex_dtype (
310363 self .dtype
311364 ): # tabulate converts complex numbers to real numbers
312365 stats = {k : str (v ) for k , v in self .stats .items ()}
313- stats_table = tabulate (list (stats .items ()), headers = ["Statistic" , "Value" ])
314- output = ["" .join ([x , "\n \n " ]) for x in [series_table , freq_table , stats_table ]]
315- output = "" .join (output ).strip ()
316- return output + "\n "
366+ stats_table = tabulate (
367+ list (stats .items ()),
368+ headers = ["Statistic" , "Value" ],
369+ tablefmt = table_fmt ,
370+ )
371+ return [series_table , freq_table , stats_table ]
372+
373+ def __repr__ (self ):
374+ """Printable version of profile."""
375+ output = ["" .join ([x , "\n \n " ]) for x in self .__create_tables (self ._format )]
376+ return "" .join (output ).strip () + "\n "
377+
378+ def _repr_html_ (self ):
379+ """HTML representation of profile."""
380+ tables = [_format_html_table (t ) for t in self .__create_tables ("html" )]
381+ tables [2 ] = _decimal_align_col (tables [2 ], 1 )
382+ return tables [0 ] + "<br>" + tables [1 ] + "<br>" + tables [2 ]
317383
318384 def save_report (self , path ):
319385 """Save profile to provided path.
@@ -323,3 +389,23 @@ def save_report(self, path):
323389 """
324390 with open (path , "w+" , encoding = "utf-8" ) as fh :
325391 fh .write (str (self ))
392+
393+
def _format_html_table(table: str, align: str = "left", font: str = "monospace") -> str:
    """Apply an inline font and alignment style to every cell of an HTML table.

    Args:
        table (str): HTML table markup, e.g. as prepared by tabulate.
        align (str, optional): Value used for the CSS ``text-align`` property.
        font (str, optional): Preferred font family; ``monospace`` is always
            appended as the fallback.

    Returns:
        str: The table markup with a ``style`` attribute set on each cell.
    """
    cell_style = f"font-family: {font}, monospace; text-align: {align};"
    document = bs4.BeautifulSoup(table, "html.parser")
    for table_row in document.find_all("tr"):
        # Rows inside <thead> carry 'th' cells, body rows carry 'td' cells.
        for cell in table_row.find_all(["th", "td"]):
            cell["style"] = cell_style  # replaces any existing inline style
    return str(document)
402+
403+
def _decimal_align_col(table: str, col: int) -> str:
    """Create decimal-aligned numbers in column of HTML table.

    Ordinary spaces in the chosen cell of each data row are swapped for
    figure spaces (U+2007).

    Args:
        table (str): HTML table markup.
        col (int): Index of the ``td`` cell to rewrite within each row.

    Returns:
        str: The table markup with the selected column rewritten. Rows that
            have no such cell, or whose cell holds no single text node, are
            left unchanged.
    """
    soup = bs4.BeautifulSoup(table, "html.parser")
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        try:
            cell = cells[col]
        except IndexError:  # header row (only 'th') or row with too few cells
            continue
        text = cell.string
        if text is None:  # empty cell or nested markup: nothing to pad
            continue
        cell.string = text.replace(" ", "\u2007")  # figure space
    return str(soup)
0 commit comments