Appendix
PS C:\Users\Admin > pyspark #Start PySpark
>>> #Create SparkSession
>>> from pyspark.sql import SparkSession
>>> spark = SparkSession.builder \
... .appName("CustomerPurchaseAnalysis") \
... .master("local[*]") \
... .getOrCreate()
>>> #Read CSV File
>>> df = spark.read \
... .option("header", "true") \
... .option("inferSchema", "true") \
... .csv("C:/Users/Admin/Desktop/customer_purchases_large.csv")
>>> #Show Data
>>> df.show()
+-----------+--------+----------------+---------------+--------+---------+-------------+
|Customer_ID|Order_ID|Product_Category|Purchase_Amount|Quantity| City|Purchase_Date|
+-----------+--------+----------------+---------------+--------+---------+-------------+
| C0001| O1001| Home & Kitchen| 4667.38| 1| Mumbai| 10-05-2025|
| C0002| O1002| Books| 2869.15| 3|Hyderabad| 08-02-2025|
| C0003| O1003| Groceries| 3513.59| 8| Chennai| 23-03-2025|
| C0004| O1004| Books| 4620.25| 3| Kolkata| 09-07-2025|
| C0005| O1005| Books| 3565.47| 8| Chennai| 10-08-2025|
| C0006| O1006| Fashion| 847.44| 1| Chennai| 31-03-2025|
| C0007| O1007| Groceries| 2923.81| 5| Delhi| 22-01-2025|
| C0008| O1008| Groceries| 3072.9| 4| Mumbai| 23-05-2025|
| C0009| O1009| Groceries| 2178.24| 5| Kolkata| 27-03-2025|
| C0010| O1010| Books| 3708.58| 8|Hyderabad| 22-03-2025|
| C0011| O1011| Home & Kitchen| 4678.4| 4| Delhi| 17-12-2025|
| C0012| O1012| Groceries| 4635.29| 4|Bangalore| 13-08-2025|
| C0013| O1013| Books| 2309.11| 3|Hyderabad| 23-07-2025|
| C0014| O1014| Fashion| 654.87| 7| Pune| 01-06-2025|
| C0015| O1015| Home & Kitchen| 4925.72| 3|Hyderabad| 06-06-2025|
| C0016| O1016| Fashion| 4210.6| 3|Hyderabad| 17-07-2025|
| C0017| O1017| Home & Kitchen| 710.85| 5| Chennai| 12-09-2025|
| C0018| O1018| Books| 4612.13| 5|Hyderabad| 14-09-2025|
| C0019| O1019| Electronics| 4362.49| 6| Pune| 04-03-2025|
| C0020| O1020| Home & Kitchen| 2642.31| 8| Pune| 02-07-2025|
+-----------+--------+----------------+---------------+--------+---------+-------------+
only showing top 20 rows
>>> #Drop NULL Values
>>> df = df.na.drop()
>>> df.show(10)
+-----------+--------+----------------+---------------+--------+---------+-------------+
|Customer_ID|Order_ID|Product_Category|Purchase_Amount|Quantity| City|Purchase_Date|
+-----------+--------+----------------+---------------+--------+---------+-------------+
| C0001| O1001| Home & Kitchen| 4667.38| 1| Mumbai| 10-05-2025|
| C0002| O1002| Books| 2869.15| 3|Hyderabad| 08-02-2025|
| C0003| O1003| Groceries| 3513.59| 8| Chennai| 23-03-2025|
| C0004| O1004| Books| 4620.25| 3| Kolkata| 09-07-2025|
| C0005| O1005| Books| 3565.47| 8| Chennai| 10-08-2025|
| C0006| O1006| Fashion| 847.44| 1| Chennai| 31-03-2025|
| C0007| O1007| Groceries| 2923.81| 5| Delhi| 22-01-2025|
| C0008| O1008| Groceries| 3072.9| 4| Mumbai| 23-05-2025|
| C0009| O1009| Groceries| 2178.24| 5| Kolkata| 27-03-2025|
| C0010| O1010| Books| 3708.58| 8|Hyderabad| 22-03-2025|
+-----------+--------+----------------+---------------+--------+---------+-------------+
only showing top 10 rows
>>> #Round-up Purchase_Amount
>>> from pyspark.sql.functions import ceil, col
>>> df = df.withColumn("Purchase_Amount", ceil(col("Purchase_Amount")))
>>> df.show(10)
+-----------+--------+----------------+---------------+--------+---------+-------------+
|Customer_ID|Order_ID|Product_Category|Purchase_Amount|Quantity| City|Purchase_Date|
+-----------+--------+----------------+---------------+--------+---------+-------------+
| C0001| O1001| Home & Kitchen| 4668| 1| Mumbai| 10-05-2025|
| C0002| O1002| Books| 2870| 3|Hyderabad| 08-02-2025|
| C0003| O1003| Groceries| 3514| 8| Chennai| 23-03-2025|
| C0004| O1004| Books| 4621| 3| Kolkata| 09-07-2025|
| C0005| O1005| Books| 3566| 8| Chennai| 10-08-2025|
| C0006| O1006| Fashion| 848| 1| Chennai| 31-03-2025|
| C0007| O1007| Groceries| 2924| 5| Delhi| 22-01-2025|
| C0008| O1008| Groceries| 3073| 4| Mumbai| 23-05-2025|
| C0009| O1009| Groceries| 2179| 5| Kolkata| 27-03-2025|
| C0010| O1010| Books| 3709| 8|Hyderabad| 22-03-2025|
+-----------+--------+----------------+---------------+--------+---------+-------------+
only showing top 10 rows
>>> #Category-wise Total Sales
>>> from pyspark.sql.functions import sum, desc
>>> salescategory = df.groupBy("Product_Category") \
... .agg(sum("Purchase_Amount").alias("Total Sales")) \
... .orderBy(desc("Total Sales"))
>>> salescategory.show()
+----------------+-----------+
|Product_Category|Total Sales|
+----------------+-----------+
| Home & Kitchen| 546981|
| Electronics| 518446|
| Books| 512708|
| Fashion| 501020|
| Groceries| 472555|
+----------------+-----------+
>>> #City-wise Total Sales
>>> citysales = df.groupBy("City") \
... .agg(sum("Purchase_Amount").alias("Citywise Sales")) \
... .orderBy(desc("Citywise Sales"))
>>> citysales.show()
+---------+--------------+
| City|Citywise Sales|
+---------+--------------+
|Bangalore| 405336|
| Mumbai| 397773|
| Delhi| 371322|
| Chennai| 361986|
|Hyderabad| 350619|
| Pune| 346648|
| Kolkata| 318026|
+---------+--------------+
>>> #Convert Spark DataFrames to Pandas
>>> sales_pd = salescategory.toPandas()
>>> city_pd = citysales.toPandas()
>>> sales_pd
Product_Category Total Sales
0 Home & Kitchen 546981
1 Electronics 518446
2 Books 512708
3 Fashion 501020
4 Groceries 472555
>>> city_pd
City Citywise Sales
0 Bangalore 405336
1 Mumbai 397773
2 Delhi 371322
3 Chennai 361986
4 Hyderabad 350619
5 Pune 346648
6 Kolkata 318026
>>> #Import Matplotlib for Data Visualization
>>> import matplotlib.pyplot as plt
>>> #Visualization Category-wise Total Sales
>>> colors = ["skyblue", "orange", "green", "red", "purple"]
>>> plt.figure()
>>> plt.bar(sales_pd["Product_Category"],sales_pd["Total Sales"],color = colors)
>>> plt.xlabel("Product Category")
>>> plt.ylabel("Total Sales")
>>> plt.title("Category-wise Total Sales")
>>> plt.xticks(rotation = 30)
>>> plt.tight_layout()
>>> plt.show()
>>> #Visualization City-wise Total Sales
>>> plt.figure()
>>> plt.pie(city_pd["Citywise Sales"],labels = city_pd["City"],
... autopct="%1.1f%%",
... startangle = 140,
... colors = colors)
>>> plt.title("City-wise Sales Distribution")
>>> plt.axis("equal")
>>> plt.tight_layout()
>>> plt.show()
>>> #City-wise Category Sales
>>> from pyspark.sql.functions import sum
>>> city_category_sales = df.groupBy("City", "Product_Category") \
... .agg(sum("Purchase_Amount").alias("Total Sales"))
>>> city_category_sales.show();
+---------+----------------+-----------+
| City|Product_Category|Total Sales|
+---------+----------------+-----------+
|Hyderabad| Electronics| 73265|
| Pune| Groceries| 77325|
| Delhi| Fashion| 66492|
| Chennai| Electronics| 57436|
|Hyderabad| Books| 97599|
| Pune| Fashion| 79559|
| Mumbai| Home & Kitchen| 100141|
|Bangalore| Books| 75360|
|Bangalore| Home & Kitchen| 80712|
|Bangalore| Groceries| 70320|
| Chennai| Groceries| 91337|
| Pune| Electronics| 71600|
|Bangalore| Fashion| 85423|
| Mumbai| Groceries| 97226|
| Pune| Home & Kitchen| 69783|
| Mumbai| Fashion| 92283|
| Pune| Books| 48381|
| Chennai| Books| 72969|
| Delhi| Books| 78316|
|Hyderabad| Fashion| 66998|
+---------+----------------+-----------+
only showing top 20 rows
>>> #Convert to Pandas & Pivot
>>> cc_pd = city_category_sales.toPandas()
>>> pivot_df = cc_pd.pivot(index="City", columns="Product_Category", values="Total_Sales").fillna(0)
>>> pivot_df.plot(kind="bar", stacked = True)
>>> plt.xlabel("City")
>>> plt.ylabel("Total Sales")
>>> plt.title("City-wise Category Sales")
>>> plt.xticks(rotation = 30)
>>> plt.tight_layout()
>>> plt.show()
>>> pivot_df.plot(kind="bar")
>>> plt.xlabel("City")
>>> plt.ylabel("Total Sales")
>>> plt.title("City-wise Category Sales (Grouped Bar Chart)")
>>> plt.xticks(rotation = 30)
>>> plt.tight_layout()
>>> plt.show()