import pandas as pd

# Read the perfume dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset_analysis.csv')

# Peek at the top rows to see which columns are available
df.head(n=5)

# Per-column null counts, limited to the fields the analysis below relies on
key_columns = ['perfume_name', 'perfume_brand', 'year', 'gender', 'rating_value', 'rating_count']
missing_values = df.loc[:, key_columns].isna().sum()

# describe() summaries for year, rating_value and rating_count, in that order
year_distribution, rating_value_distribution, rating_count_distribution = (
    df[column].describe() for column in ('year', 'rating_value', 'rating_count')
)

# Number of perfumes under each gender label
gender_distribution = df['gender'].value_counts()

# Notebook-style display of every summary computed above
(
    missing_values,
    year_distribution,
    rating_value_distribution,
    rating_count_distribution,
    gender_distribution,
)

(perfume_name     0
 perfume_brand    0
 year             0
 gender           0
 rating_value     0
 rating_count     0
 dtype: int64,
 count    1000.000000
 mean     2013.652000
 std        12.677445
 min      1792.000000
 25%      2010.000000
 50%      2017.000000
 75%      2021.000000
 max      2024.000000
 Name: year, dtype: float64,
 count    1000.000000
 mean        4.091390
 std         0.229733
 min         3.080000
 25%         3.950000
 50%         4.105000
 75%         4.260000
 max         4.680000
 Name: rating_value, dtype: float64,
 count     1000.000000
 mean      4599.518000
 std       4467.480919
 min        152.000000
 25%       1631.000000
 50%       3015.500000
 75%       5931.250000
 max      28831.000000
 Name: rating_count, dtype: float64,
 gender
 Unisex    367
 Women     354
 Men       279
 Name: count, dtype: int64)

# Group the rows by gender once, then average each rating metric per group
by_gender = df.groupby('gender')
average_ratings_by_gender = by_gender['rating_value'].mean()
average_count_by_gender = by_gender['rating_count'].mean()

# Show both per-gender averages as the cell output
(average_ratings_by_gender, average_count_by_gender)

(gender
 Men       4.224158
 Unisex    4.103787
 Women     3.973898
 Name: rating_value, dtype: float64,
 gender
 Men       4882.906810
 Unisex    2901.604905
 Women     6136.435028
 Name: rating_count, dtype: float64)

import matplotlib.pyplot as plt
import seaborn as sns


# 1. Yearly Distribution of Perfumes
# Number of fragrances per release year, ordered chronologically.
yearly_distribution = df['year'].value_counts().sort_index()

# These two names were previously recomputed with the exact same expression
# (no filtering or correction is applied to df in between), so they are now
# derived from the single computation above. Independent copies are used so
# that mutating one of them cannot affect the others.
yearly_distribution_filtered = yearly_distribution.copy()
popularity_distribution_by_year = yearly_distribution.copy()

import pandas as pd
import plotly.graph_objs as go
import plotly.offline as pyo

# The year column is loaded as float (e.g. 2020.0); cast it to int so that
# axis labels and tooltips read "2020". This mutates df in place.
df['year'] = df['year'].astype(int)

# One row per release year: summed rating counts and number of fragrances.
summary_df = (
    df.groupby('year')
    .agg(total_ratings=('rating_count', 'sum'), count=('perfume_name', 'count'))
    .reset_index()
)
# String version of the year, used as the categorical x-axis value.
summary_df['year_str'] = summary_df['year'].astype(str)

# Custom tooltip for each bar, one entry per row of summary_df.
hover_text = [
    f'Year: {year}<br>Number of Fragrances: {count}<br>Total Rating Count: {total_ratings}'
    for year, count, total_ratings in zip(
        summary_df['year'], summary_df['count'], summary_df['total_ratings']
    )
]

# Bar height = number of fragrances; bar colour = total rating count for the year.
fig = go.Figure(data=[
    go.Bar(
        x=summary_df['year_str'],
        y=summary_df['count'],
        text=summary_df['count'],  # Value printed on the bar itself
        hoverinfo='text',  # Show only the custom hover text
        hovertext=hover_text,
        marker=dict(color=summary_df['total_ratings'], coloraxis="coloraxis")
    )
])

# Colour scale and figure title. The bars are coloured through the shared
# color axis, so the key for the colours is the colorbar: its title has to be
# set on coloraxis.colorbar (legend_title_text does not label a colorbar).
fig.update_layout(
    coloraxis=dict(
        colorscale='Viridis',
        colorbar=dict(title=dict(text='Total Rating Count'))
    ),
    title='Distribution of Top 1000 Ranked Fragrances by Release Year'
)

# Layout adjustments
fig.update_layout(
    title_x=0.5,  # Centre the title horizontally
    xaxis=dict(
        title='Year of Release',
        type='category',  # Treat years as discrete labels
        tickangle=-45
    ),
    yaxis=dict(
        title='Total Perfumes by Year'
    ),
    bargap=0.1,  # Adjust this for desired bar thickness
    plot_bgcolor='white',
    paper_bgcolor='white'
)

fig.show()

# Average rating value and average rating count of the fragrances released in each year
avg_rating_value_by_year = df.groupby('year')['rating_value'].mean()
avg_rating_count_by_year = df.groupby('year')['rating_count'].mean()

# Apply the dark theme only while this figure is built and shown.
# style.context restores the previous rcParams on exit (even if plotting
# raises), instead of forcing the global style back to 'default'.
with plt.style.context('dark_background'):
    plt.figure(figsize=(14, 7))

    # Left y-axis: average rating value. The automatic legend is suppressed
    # because a combined legend is built manually below.
    ax1 = sns.lineplot(x=avg_rating_value_by_year.index, y=avg_rating_value_by_year.values,
                       marker='o', label='Average Rating Value', color='purple', legend=False)

    # Right y-axis sharing the same x-axis: average rating count, drawn in
    # semi-transparent blue, automatic legend suppressed as well.
    ax2 = ax1.twinx()
    sns.lineplot(x=avg_rating_count_by_year.index, y=avg_rating_count_by_year.values,
                 marker='o', label='Average Rating Count', color='blue', alpha=0.4, ax=ax2, legend=False)

    # Titles and labels. The x label must be set on ax1: the twin axes created
    # by twinx() hides its own x-axis, so a label set through plt.xlabel()
    # after twinx() would never be displayed.
    ax1.set_title('Average Rating Value and Count by Release Year for Top 1000 Fragrances', color='white')
    ax1.set_xlabel('Release Year', color='white')
    ax1.set_ylabel('Average Rating Value', color='white')
    ax2.set_ylabel('Average Rating Count', color='white')

    # Single legend combining the line handles of both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper left', frameon=False)  # frameon=False: no legend background

    # White ticks and tick labels on both axes
    ax1.tick_params(colors='white', which='both')
    ax2.tick_params(colors='white', which='both')

    plt.tight_layout()
    plt.show()

from scipy.stats import pearsonr

# Pearson correlation between the per-year average rating value and the
# per-year average rating count. The values are paired by position, so both
# Series are assumed to share the same index.
yearly_rating_values = avg_rating_value_by_year.values
yearly_rating_counts = avg_rating_count_by_year.values
correlation_coefficient, p_value = pearsonr(yearly_rating_values, yearly_rating_counts)

# Report the coefficient and its p-value, one per line
print(
    f"Pearson Correlation Coefficient: {correlation_coefficient}",
    f"P-Value: {p_value}",
    sep="\n",
)

Pearson Correlation Coefficient: -0.4981272019674901
P-Value: 0.00019948059660849693

# The ten brands with the most fragrances in the list, with their counts
brand_counts = df['perfume_brand'].value_counts().iloc[:10]

# Just the brand names, in the same order as brand_counts
top_brands = list(brand_counts.index)

# Table of fragrance counts: rows are release years, columns are the top
# brands, and year/brand combinations with no fragrance are filled with 0
is_top_brand = df['perfume_brand'].isin(top_brands)
top_brands_yearly = (
    df.loc[is_top_brand]
    .groupby(['year', 'perfume_brand'])
    .size()
    .unstack(fill_value=0)
)

# Show the last ten rows, i.e. the ten most recent release years
top_brands_yearly.tail(n=10)

# Heatmap overview: one row per brand, one column per release year

# Year labels for the x-axis ticks, converted to plain integers
years_int = [int(year) for year in top_brands_yearly.index]

plt.figure(figsize=(14, 8))
# Brands go on the y-axis, so the year-by-brand table is transposed first
heatmap_ax = sns.heatmap(
    top_brands_yearly.T,
    xticklabels=years_int,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    linewidths=.5,
)
heatmap_ax.set_title('The distribution of top fragrances by brand across selected years')
heatmap_ax.set_xlabel('Year')
heatmap_ax.set_ylabel('Brand')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Hand-picked brands to compare in the visualization
selected_brands = [
    'Chanel',
    'Dior',
    'Dolce&Gabbana',
    'Giorgio Armani',
    'Guerlain',
    'Lattafa Perfumes',
    'Parfums de Marly',
    'Tom Ford',
    'Xerjoff',
    'Yves Saint Laurent',
]

# Fragrance counts per release year (rows) and selected brand (columns), 0 where none
is_selected_brand = df['perfume_brand'].isin(selected_brands)
selected_brands_yearly = (
    df.loc[is_selected_brand]
    .groupby(['year', 'perfume_brand'])
    .size()
    .unstack(fill_value=0)
)

# Keep only the rows whose rank is 100 or better
top_100_df = df.loc[df['rank'] <= 100]

# Number of top-100 entries per selected brand, ordered alphabetically by brand name
top_100_selected = top_100_df.loc[
    top_100_df['perfume_brand'].isin(selected_brands), 'perfume_brand'
]
top_100_counts_alphabetical = top_100_selected.value_counts().sort_index()

# One "Paired" palette colour per brand that is actually present
color_palette = sns.color_palette("Paired", n_colors=len(top_100_counts_alphabetical))

# Pie chart of each brand's share, slices coloured in alphabetical brand order
plt.figure(figsize=(10, 8))
plt.pie(
    top_100_counts_alphabetical,
    labels=top_100_counts_alphabetical.index,
    colors=color_palette,
    autopct='%1.1f%%',
    startangle=100,
)
plt.title('Presence in Top 100 Fragrances List by Brand')
plt.tight_layout()
plt.show()

import plotly.express as px

# Rows belonging to the brand Lattafa Perfumes only
lattafa_perfumes = df.loc[df['perfume_brand'] == 'Lattafa Perfumes']

# Human-readable names for the columns shown on axes, colorbar and tooltips
display_labels = {
    'year': 'Year of Release',
    'rating_value': 'Rating Value',
    'rating_count': 'Rating Count',
    'rank': 'Rank',
}

# Bubble chart: x = release year, y = rating value,
# bubble size = rating count, bubble colour = rating value
fig = px.scatter(
    data_frame=lattafa_perfumes,
    x='year',
    y='rating_value',
    size='rating_count',
    color='rating_value',
    hover_name='perfume_name',  # Perfume name as the tooltip heading
    hover_data=['rank'],  # Also list the rank in the tooltip
    color_continuous_scale=px.colors.diverging.PiYG,  # Diverging colour scale
    title='Lattafa Perfumes: Current Rating Value and Count Per Release',
    labels=display_labels,
    height=600,
    width=1000,
    template='plotly_dark'  # Dark theme
)

# Title centred near the top of the figure
title_position = dict(y=0.9, x=0.5, xanchor='center', yanchor='top')

# Legend anchored in the top-left corner with a semi-transparent background
legend_style = dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01,
    bgcolor='rgba(50, 50, 50, 0.5)'
)

fig.update_layout(
    title=title_position,
    hovermode='closest',
    legend_title_text='Rating Value',
    legend=legend_style
)


fig.show()

	rank	perfume_name	perfume_brand	year	gender	perfumers	rating_value	rating_count	sentiments	main_accords	...	note_woodsy_notes
0	1	Angels' Share	By Kilian	2020.0	Unisex	Benoist Lapouza	4.38	10342	{'love': '100', 'like': '35.8974', 'ok': '12.6...	{'woody': '100', 'warm spicy': '97.3637', 'swe...	...	0
1	2	Khamrah	Lattafa Perfumes	2022.0	Unisex	Unlisted	4.40	6669	{'love': '100', 'like': '39.2081', 'ok': '10.9...	{'sweet': '100', 'warm spicy': '79.0738', 'amb...	...	0
2	3	Le Male Le Parfum	Jean Paul Gaultier	2020.0	Men	Natalie Gracia-Cetto, Quentin Bisch	4.59	10925	{'love': '100', 'like': '24.389', 'ok': '6.433...	{'warm spicy': '100', 'vanilla': '78.7809', 'a...	...	1
3	4	Baccarat Rouge 540	Maison Francis Kurkdjian	2015.0	Unisex	Francis Kurkdjian	3.88	18718	{'love': '100', 'like': '46.4949', 'ok': '19.2...	{'woody': '100', 'amber': '91.8164', 'warm spi...	...	0
4	5	Tobacco Vanille	Tom Ford	2007.0	Unisex	Olivier Gillotin	4.24	20888	{'love': '100', 'like': '49.3364', 'ok': '8.86...	{'vanilla': '100', 'sweet': '94.6943', 'tobacc...	...	0

Fragrance Trends Exploratory Data Analysis¶

Mia M.¶

04/20/2024¶

Overview¶

Overview of the Data Collection Methodology¶

Rating Count Explained¶

Rating Value Insight¶

Commencing Data Exploration¶

Checking for missing values and correcting missing data¶

The release year was missing for 24 entries in the data set, mostly from the brands Armaf and Lattafa Perfumes.¶

I filled these in manually after some research, basing each release year on the date of the oldest reviews.¶

Ratings vs. Gender Designation¶

Average Rating Value by Gender¶

Average Rating Count by Gender¶

Visualizing the distribution of the top 1000 fragrances by year¶

The Reflection of Fashion Trends in Fragrance¶

Visualizing the Trend¶

Further Analysis¶

Continuing the analysis¶

Brand Trends¶

New vs. Established Brands¶

Consistency in Popularity¶

Who releases the most fragrances and how do they rank overall?¶

Exploring the perfumes that appear the most with a heat map¶

Next, a pie chart will be used to visualize the presence of these brands in the top 100 ranks.¶

Trends in Fragrance Brand Popularity¶

A Shift in Preference Away from Guerlain¶

Potential Reasons Behind the Changing Trends¶

Upcoming Analysis on Guerlain¶

Rising Popularity¶

Quality and Engagement¶

Market Position and Consumer Perception¶

Conclusion¶

perfume_brand	Chanel	Dior	Dolce&Gabbana	Giorgio Armani	Guerlain	Lattafa Perfumes	Parfums de Marly	Tom Ford	Xerjoff	Yves Saint Laurent
year
2015	1	2	1	1	0	0	1	2	1	1
2016	2	2	0	2	1	2	1	1	0	1
2017	1	2	2	2	2	1	2	2	2	1
2018	2	2	2	1	1	1	2	3	1	1
2019	2	1	1	3	1	0	3	2	3	1
2020	1	2	3	3	1	3	2	4	0	1
2021	0	6	2	4	0	5	3	3	3	4
2022	0	2	1	2	1	8	0	2	0	4
2023	0	0	2	2	3	5	2	3	0	2
2024	0	0	0	0	0	0	0	0	0	1

Fragrance Trends Exploratory Data Analysis¶

Mia M.¶

04/20/2024¶

Overview¶

Overview of the Data Collection Methodology¶

Rating Count Explained¶

Rating Value Insight¶

Commencing Data Exploration¶

Checking for missing values and correcting missing data¶

The release year was missing for 24 entries in the data set, mostly from the brands Armaf and Lattafa Perfumes.¶

I filled these in manually after some research, basing each release year on the date of the oldest reviews.¶

Ratings vs. Gender Designation¶

Average Rating Value by Gender¶

Average Rating Count by Gender¶

Visualizing the distribution of the top 1000 fragrances by year¶

The Reflection of Fashion Trends in Fragrance¶

Visualizing the Trend¶

Further Analysis¶

Continuing the analysis¶

Brand Trends¶

New vs. Established Brands¶

Consistency in Popularity¶

Who releases the most fragrances and how do they rank overall?¶

Exploring the perfumes that appear the most with a heat map¶

Next, a pie chart will be used to visualize the presence of these brands in the top 100 ranks.¶

Trends in Fragrance Brand Popularity¶

A Shift in Preference Away from Guerlain¶

Potential Reasons Behind the Changing Trends¶

Upcoming Analysis on Guerlain¶

Rising Popularity¶

Quality and Engagement¶

Market Position and Consumer Perception¶

Conclusion¶

The next step is to find out why Lattafa is trending and how they compare to their top competitors.¶