import pandas as pd
import numpy as np
from tabulate import tabulate # for table summary
import scipy.stats as stats
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm # for lowess smoothing
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col, pow, mean, avg, when, log, sqrt, exp
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
spark = SparkSession.builder.master("local[*]").getOrCreate()
Introduction
In this post, we’ll take a look at a dataset containing information about household Ben & Jerry’s ice cream purchases. First, we’ll analyze the data using descriptive statistics, filtering, and group operations. After that, we’ll build a simple linear regression model to predict the price per serving of Ben & Jerry’s ice cream from household characteristics.
Loading Packages, UDFs, and the DataFrame
Before we get started, we must load the packages and UDFs necessary for our analysis and linear regression.
def add_dummy_variables(var_name, reference_level, category_order=None):
    """
    Creates dummy variables for the specified column in the global DataFrames dtrain and dtest.
    Allows manual setting of category order.

    Parameters:
        var_name (str): The name of the categorical column (e.g., "borough_name").
        reference_level (int): Index of the category to be used as the reference (dummy omitted).
        category_order (list, optional): List of categories in the desired order. If None, categories are sorted.

    Returns:
        dummy_cols (list): List of dummy column names excluding the reference category.
        ref_category (str): The category chosen as the reference.
    """
    global dtrain, dtest

    # Get distinct categories from the training set.
    categories = dtrain.select(var_name).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert booleans to strings if present.
    categories = [str(c) if isinstance(c, bool) else c for c in categories]

    # Use manual category order if provided; otherwise, sort categories.
    if category_order:
        # Ensure all categories are present in the user-defined order.
        missing = set(categories) - set(category_order)
        if missing:
            raise ValueError(f"These categories are missing from your custom order: {missing}")
        categories = category_order
    else:
        categories = sorted(categories)

    # Validate reference_level.
    if reference_level < 0 or reference_level >= len(categories):
        raise ValueError(f"reference_level must be between 0 and {len(categories) - 1}")

    # Define the reference category.
    ref_category = categories[reference_level]
    print("Reference category (dummy omitted):", ref_category)

    # Create dummy variables for all categories.
    for cat in categories:
        dummy_col_name = var_name + "_" + str(cat).replace(" ", "_")
        dtrain = dtrain.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))
        dtest = dtest.withColumn(dummy_col_name, when(col(var_name) == cat, 1).otherwise(0))

    # List of dummy columns, excluding the reference category.
    dummy_cols = [var_name + "_" + str(cat).replace(" ", "_") for cat in categories if cat != ref_category]

    return dummy_cols, ref_category

# Example usage without category_order:
# dummy_cols_year, ref_category_year = add_dummy_variables('year', 0)

# Example usage with category_order:
# custom_order_wkday = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
# dummy_cols_wkday, ref_category_wkday = add_dummy_variables('wkday', reference_level=0, category_order=custom_order_wkday)
def regression_table(model, assembler):
    """
    Creates a formatted regression table from a fitted LinearRegression model and its VectorAssembler,
    and inserts a dashed horizontal line after the Intercept row. The table includes separate columns
    for the 95% confidence interval lower and upper bounds for each coefficient (computed at the 5%
    significance level) and an "Observations" row (using model.summary.numInstances) above the R² row.
    The RMSE row is placed as the last row.

    The columns are ordered as:
        Metric | Value | Significance | Std. Error | p-value | 95% CI Lower | 95% CI Upper

    For the "Value", "Std. Error", "95% CI Lower", and "95% CI Upper" columns, commas are inserted
    every three digits, with 3 decimal places (except for Observations, which is formatted as an
    integer with commas).

    Parameters:
        model: A fitted LinearRegression model (with a .summary attribute).
        assembler: The VectorAssembler used to assemble the features for the model.

    Returns:
        A formatted string containing the regression table.
    """
    # Extract coefficients and standard errors as NumPy arrays.
    coeffs = model.coefficients.toArray()
    std_errors_all = np.array(model.summary.coefficientStandardErrors)

    # Check if the intercept's standard error is included (one extra element).
    if len(std_errors_all) == len(coeffs) + 1:
        intercept_se = std_errors_all[0]
        std_errors = std_errors_all[1:]
    else:
        intercept_se = None
        std_errors = std_errors_all

    # Compute t-statistics for feature coefficients (t = beta / SE(beta)).
    # t_stats = coeffs / std_errors
    t_stats = model.summary.tValues

    # Degrees of freedom: number of instances minus number of predictors minus 1 (for the intercept).
    df = model.summary.numInstances - len(coeffs) - 1

    # Compute the t-critical value for a 95% confidence interval (two-tailed, 5% significance).
    t_critical = stats.t.ppf(0.975, df)

    # Compute two-tailed p-values for each feature coefficient.
    # p_values = [2 * (1 - stats.t.cdf(np.abs(t), df)) for t in t_stats]
    p_values = model.summary.pValues

    # Function to assign significance stars based on p-value.
    def significance_stars(p):
        if p < 0.01:
            return "***"
        elif p < 0.05:
            return "**"
        elif p < 0.1:
            return "*"
        else:
            return ""

    # Build the table rows.
    # Order: Metric, Value, Significance, Std. Error, p-value, 95% CI Lower, 95% CI Upper.
    table = []
    for feature, beta, se, p in zip(assembler.getInputCols(), coeffs, std_errors, p_values):
        ci_lower = beta - t_critical * se
        ci_upper = beta + t_critical * se
        table.append([
            "Beta: " + feature,     # Metric name
            beta,                   # Beta estimate (Value)
            significance_stars(p),  # Significance stars
            se,                     # Standard error
            p,                      # p-value
            ci_lower,               # 95% CI lower bound
            ci_upper                # 95% CI upper bound
        ])

    # Compute and add the intercept row with its SE, p-value, significance, and CI (if available).
    if intercept_se is not None:
        intercept_t = model.intercept / intercept_se
        intercept_p = 2 * (1 - stats.t.cdf(np.abs(intercept_t), df))
        intercept_sig = significance_stars(intercept_p)
        ci_intercept_lower = model.intercept - t_critical * intercept_se
        ci_intercept_upper = model.intercept + t_critical * intercept_se
    else:
        intercept_se = ""
        intercept_p = ""
        intercept_sig = ""
        ci_intercept_lower = ""
        ci_intercept_upper = ""

    table.append([
        "Intercept",
        model.intercept,
        intercept_sig,
        intercept_se,
        intercept_p,
        ci_intercept_lower,
        ci_intercept_upper
    ])

    # Append overall model metrics:
    # Insert an Observations row using model.summary.numInstances,
    # then an R² row, and finally the RMSE row as the last row.
    table.append(["Observations", model.summary.numInstances, "", "", "", "", ""])
    table.append(["R²", model.summary.r2, "", "", "", "", ""])
    table.append(["RMSE", model.summary.rootMeanSquaredError, "", "", "", "", ""])

    # Format the table.
    # For the "Value" (index 1), "Std. Error" (index 3), "95% CI Lower" (index 5), and
    # "95% CI Upper" (index 6) columns, format with commas and 3 decimal places, except
    # for Observations, which should be an integer with commas.
    # For the p-value (index 4), format to 3 decimal places.
    formatted_table = []
    for row in table:
        formatted_row = []
        for i, item in enumerate(row):
            if row[0] == "Observations" and i == 1 and isinstance(item, (int, float, np.floating)) and item != "":
                # Format Observations as an integer with commas, no decimals.
                formatted_row.append(f"{int(item):,}")
            elif isinstance(item, (int, float, np.floating)) and item != "":
                if i in [1, 3, 5, 6]:
                    formatted_row.append(f"{item:,.3f}")
                elif i == 4:
                    formatted_row.append(f"{item:.3f}")
                else:
                    formatted_row.append(f"{item:.3f}")
            else:
                formatted_row.append(item)
        formatted_table.append(formatted_row)

    # Generate the table string using tabulate.
    table_str = tabulate(
        formatted_table,
        headers=["Metric", "Value", "Sig.", "Std. Error", "p-value", "95% CI Lower", "95% CI Upper"],
        tablefmt="pretty",
        colalign=("left", "right", "center", "right", "right", "right", "right")
    )

    # Insert a dashed line after the Intercept row for clarity.
    lines = table_str.split("\n")
    dash_line = '-' * len(lines[0])
    for i, line in enumerate(lines):
        if "Intercept" in line and not line.strip().startswith('+'):
            lines.insert(i + 1, dash_line)
            break

    return "\n".join(lines)
ice_cream = pd.read_csv('https://bcdanl.github.io/data/ben-and-jerry-cleaned.csv')
ice_cream
 | priceper1 | flavor_descr | size1_descr | household_id | household_income | household_size | usecoup | couponper1 | region | married | race | hispanic_origin | microwave | dishwasher | sfh | internet | tvcable |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.41 | CAKE BATTER | 16.0 MLOZ | 2001456 | 130000 | 2 | True | 0.5 | Central | False | white | False | True | False | False | True | True |
1 | 3.50 | VAN CARAMEL FUDGE | 16.0 MLOZ | 2001456 | 130000 | 2 | False | 0.0 | Central | False | white | False | True | False | False | True | True |
2 | 3.50 | VAN CARAMEL FUDGE | 16.0 MLOZ | 2001456 | 130000 | 2 | False | 0.0 | Central | False | white | False | True | False | False | True | True |
3 | 3.00 | W-N-C-P-C | 16.0 MLOZ | 2001637 | 70000 | 1 | False | 0.0 | West | False | white | False | True | True | True | False | True |
4 | 3.99 | AMERICONE DREAM | 16.0 MLOZ | 2002791 | 130000 | 3 | False | 0.0 | South | True | white | False | True | True | True | True | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
21969 | 3.34 | DUBLIN MUDSLIDE | 16.0 MLOZ | 9171249 | 80000 | 4 | False | 0.0 | South | True | white | False | True | True | True | True | True |
21970 | 1.99 | PHISH FOOD | 16.0 MLOZ | 9171249 | 80000 | 4 | False | 0.0 | South | True | white | False | True | True | True | True | True |
21971 | 4.99 | VAN | 16.0 MLOZ | 9176214 | 80000 | 1 | False | 0.0 | East | False | white | False | True | False | False | True | True |
21972 | 3.50 | VAN | 16.0 MLOZ | 9176214 | 80000 | 1 | False | 0.0 | East | False | white | False | True | False | False | True | True |
21973 | 3.50 | VAN | 16.0 MLOZ | 9176214 | 80000 | 1 | False | 0.0 | East | False | white | False | True | False | False | True | True |
21974 rows × 17 columns
ice_cream['tvcable'] = ice_cream['tvcable'].astype(bool)
df = spark.createDataFrame(ice_cream)
Descriptive Statistics
df.describe().show()
+-------+------------------+---------------+-----------+--------------------+------------------+------------------+-------------------+-------+-----+
|summary| priceper1| flavor_descr|size1_descr| household_id| household_income| household_size| couponper1| region| race|
+-------+------------------+---------------+-----------+--------------------+------------------+------------------+-------------------+-------+-----+
| count| 21974| 21974| 21974| 21974| 21974| 21974| 21974| 21974|21974|
| mean| 3.314627108010865| NULL| NULL|1.6612005039000638E7|125290.79821607354|2.4564030217529806|0.12557882800885908| NULL| NULL|
| stddev|0.6656263892402016| NULL| NULL|1.1685954458195915E7| 57188.36322324326|1.3368209496554313| 0.5178886507790131| NULL| NULL|
| min| 0.0|AMERICONE DREAM| 16.0 MLOZ| 2000358| 40000| 1| 0.0|Central|asian|
| max| 9.48| WHITE RUSSIAN| 32.0 MLOZ| 30440689| 310000| 9| 8.98| West|white|
+-------+------------------+---------------+-----------+--------------------+------------------+------------------+-------------------+-------+-----+
The descriptive statistics give us a summary of the central tendencies and variation for the continuous variables in the dataset. This gives us an idea of how much Ben & Jerry’s ice cream typically costs and what the purchasing households look like. For example, we can see the range of household income and household size.
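The string columns show NULL for the mean and standard deviation, so if we only care about the numeric variables we can describe just those columns. A small sketch (the column list is picked by hand):
# Summary statistics for the continuous variables only (sketch).
df.select('priceper1', 'household_income', 'household_size', 'couponper1').describe().show()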
Counting and Filtering
Next, let’s perform some basic counting and filtering operations on the variables we’ll be working with.
coupon_usage = ice_cream['usecoup'].value_counts()
print(coupon_usage)
usecoup
False 19629
True 2345
Name: count, dtype: int64
The counting operation above shows how many purchases were made with and without a coupon. In this case, a coupon was used on 2,345 of the 21,974 purchases.
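Since each row in the data is a single purchase, a household can appear multiple times. If we instead wanted the number of distinct households that used a coupon at least once, a quick sketch would be:
# Number of distinct households with at least one coupon purchase (sketch).
n_coupon_households = ice_cream.loc[ice_cream['usecoup'] == True, 'household_id'].nunique()
print(n_coupon_households)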
coupon_used = ice_cream[ice_cream['usecoup'] == True]
avg_price_coupon_used = coupon_used['priceper1'].mean()
print(avg_price_coupon_used)
3.3771737739872068
For purchases made with a coupon, the average price per serving was $3.38.
coupon_not_used = ice_cream[ice_cream['usecoup'] == False]
avg_price_coupon_not_used = coupon_not_used['priceper1'].mean()
print(avg_price_coupon_not_used)
3.307154902003595
For purchases made without a coupon, the average price per serving was $3.31.
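The gap between the two groups is only about $0.07 per serving. A quick way to gauge whether that difference is more than noise is a two-sample Welch t-test, using the scipy.stats import from above (a sketch; the variable names reuse the filtered DataFrames defined earlier):
# Welch's t-test comparing price per serving with vs. without a coupon (sketch).
t_stat, p_val = stats.ttest_ind(
    coupon_used['priceper1'],
    coupon_not_used['priceper1'],
    equal_var=False  # do not assume equal variances across the two groups
)
print(t_stat, p_val)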
Group Operations
We can use group operations to analyze how different variables are related to each other. For example, we can look at how the price of ice cream changes across different types of households. Let’s start by looking at how the price of ice cream varies by region.
price_by_region = ice_cream.groupby('region')['priceper1'].mean()
print(price_by_region)
region
Central 3.277892
East 3.422322
South 3.258359
West 3.325974
Name: priceper1, dtype: float64
On average, Ben & Jerry’s ice cream is the least expensive in the South region and most expensive in the East region.
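Sorting the group means, and adding the number of purchases in each region, makes this ordering easier to read off. A small sketch:
# Mean price and purchase count by region, sorted from least to most expensive (sketch).
print(ice_cream.groupby('region')['priceper1'].agg(['mean', 'count']).sort_values('mean'))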
Similarly, we can group the data by household income to analyze how price changes based on income.
price_by_income = ice_cream.groupby('household_income')['priceper1'].mean()
print(price_by_income)
household_income
40000 3.343187
50000 3.344026
60000 3.456909
70000 3.402600
80000 3.318014
110000 3.306350
130000 3.325455
150000 3.318129
160000 3.266847
170000 3.214524
180000 3.263123
190000 3.257739
210000 3.316303
230000 3.223304
240000 3.324874
260000 3.205166
280000 3.445214
300000 3.135817
310000 3.205312
Name: priceper1, dtype: float64
Finally, we can group the data by household size. This may give us some insight into whether household size affects how much the household pays per serving of ice cream.
price_by_size = ice_cream.groupby('household_size')['priceper1'].mean()
print(price_by_size)
household_size
1 3.362255
2 3.312518
3 3.295934
4 3.273378
5 3.331149
6 3.254255
7 3.248901
8 3.149375
9 3.041636
Name: priceper1, dtype: float64
On average, larger households tend to pay less per serving of ice cream than smaller households.
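One way to summarize that downward trend in a single number is the correlation between household size and price per serving (a sketch that treats household size as a continuous variable):
# Pearson correlation between household size and price per serving (sketch).
print(ice_cream['household_size'].corr(ice_cream['priceper1']))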
Linear Regression
Next, let’s build a linear regression model using the Ben & Jerry’s dataset to predict the price per serving of ice cream. We will use household income, household size, and coupon usage as predictor variables.
dtrain, dtest = df.randomSplit([.67, .33], seed=1234)
x_cols_1 = ['household_income', 'household_size', 'usecoup']

assembler_1 = VectorAssembler(inputCols=x_cols_1, outputCol="predictors")

dtrain_1 = assembler_1.transform(dtrain)
dtest_1 = assembler_1.transform(dtest)

# training model
model_1 = (
    LinearRegression(featuresCol="predictors", labelCol="priceper1")
    .fit(dtrain_1)
)

# making prediction
dtrain_1 = model_1.transform(dtrain_1)
dtest_1 = model_1.transform(dtest_1)
print(regression_table(model_1, assembler_1))
+------------------------+--------+------+------------+---------+--------------+--------------+
| Metric | Value | Sig. | Std. Error | p-value | 95% CI Lower | 95% CI Upper |
+------------------------+--------+------+------------+---------+--------------+--------------+
| Beta: household_income | -0.000 | *** | 0.004 | 0.000 | -0.008 | 0.008 |
| Beta: household_size | -0.028 | *** | 0.018 | 0.000 | -0.063 | 0.007 |
| Beta: usecoup | 0.064 | *** | 0.019 | 0.000 | 0.026 | 0.101 |
| Intercept | 3.476 | *** | 0.000 | 0.000 | 3.476 | 3.476 |
-----------------------------------------------------------------------------------------------
| Observations | 14,734 | | | | | |
| R² | 0.007 | | | | | |
| RMSE | 0.666 | | | | | |
+------------------------+--------+------+------------+---------+--------------+--------------+
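Note that the RMSE in this table is computed on the training split. Because we already applied the model to dtest_1, we can also check the out-of-sample error with Spark's RegressionEvaluator; this is a sketch and was not part of the original output:
from pyspark.ml.evaluation import RegressionEvaluator

# Out-of-sample RMSE on the held-out test split (sketch).
evaluator = RegressionEvaluator(labelCol="priceper1", predictionCol="prediction", metricName="rmse")
print(evaluator.evaluate(dtest_1))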
This model helps us analyze how household income, household size, and coupon usage affect the price of Ben & Jerry’s ice cream. The coefficients, or beta values, show us to what extent each feature predicts or influences the price.
The coefficient for household income is essentially zero (it rounds to -0.000 in the table), which means that changes in household income have virtually no effect on the price per serving. The coefficient for household size is -0.028, meaning that each additional person in the household is associated with a price per serving roughly $0.03 lower. Perhaps larger households tend to buy larger containers of ice cream that have a lower unit price. The coefficient for coupon usage is 0.064, so purchases made with a coupon have a price per serving about $0.06 higher. At first glance this seems counterintuitive, since coupons usually reduce the price of the item purchased; a possible explanation is that households use coupons on more expensive, specialty ice cream products. Of the three predictors, coupon usage has the largest coefficient and therefore the most influence on the predicted price per serving.
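To see the fitted equation in action, we can plug a hypothetical household into the rounded coefficients from the table (the household below is made up for illustration):
# Predicted price per serving for a hypothetical 3-person, $80,000-income household using a coupon (sketch).
# priceper1 ≈ intercept + b_income*income + b_size*size + b_coupon*usecoup
predicted = 3.476 + (-0.000) * 80000 + (-0.028) * 3 + 0.064 * 1
print(round(predicted, 2))  # about 3.46 with these rounded coefficients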
Conclusion
By analyzing this dataset, we were able to show how different household characteristics, such as household income, household size, and coupon usage, relate to the price per serving of Ben & Jerry’s ice cream. We also fit a linear regression model, which helped us quantify these relationships, although the low R² (0.007) indicates that these characteristics explain only a small share of the variation in price. This information could still be useful in marketing, as Ben & Jerry’s may want to tailor promotions to different income levels, household sizes, or other household characteristics.