import pandas as pd
import seaborn as sns # for violin plot and for regplot
import statsmodels.api as sm # for linear regression model
from sklearn.model_selection import train_test_split # split data
from scipy import stats # for pearsonr value of interaction terms
import folium # for interactive map
We obtained our data from Knoema, a site that maintains a comprehensive global dataset. This dataset contains statistics for many countries on topics in Agriculture, Economy, Health, and many others. For our purpose of finding what affects the happiness of a country, we chose to look at the following factors:

- Food Production Index
- Obesity (Female)
- Obesity (Male)
- Life Expectancy
- Education Expenditure
- Health Expenditure
We downloaded the data for each factor as CSV files, imported them, and combined them into one pandas DataFrame, joining on Country and Year as shown below. We used these factors to predict the happiness score of each country, which was also given in the Knoema dataset. The happiness scores were calculated through a survey asking people to rank their happiness on a scale from 0 to 10.
df = pd.read_csv('Food Production.csv')
obese_female = pd.read_csv('Obesity Female.csv')
obese_male = pd.read_csv('Obesity Male.csv')
life_exp = pd.read_csv('Life Expectancy.csv')
ed_exp = pd.read_csv('Education Expenditure.csv')
health_exp = pd.read_csv('Health Expenditure.csv')
happiness = pd.read_csv('Happiness Score.csv')
frames = [obese_female, obese_male,
          life_exp, ed_exp, health_exp, happiness]
for f in frames:
    df = pd.merge(df, f, on=['Country', 'Year'], how='outer')
df.head()
|   | Country | Year | Food Production Index | Obesity Female | Obesity Male | Life expectancy | Education Expenditure | Health Expenditure | Happiness Score |
|---|---|---|---|---|---|---|---|---|---|
| 0 | World | 2014 | 125.601824 | 14.4 | 10.4 | 71.742279 | 4.62946 | 1039.40797 | NaN |
| 1 | East Asia & Pacific | 2014 | 129.655064 | NaN | NaN | 75.125170 | 4.15272 | 615.14134 | NaN |
| 2 | American Samoa | 2014 | 116.620000 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | American Samoa | 2015 | 120.150000 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | American Samoa | 2016 | 120.970000 | NaN | NaN | NaN | NaN | NaN | NaN |
Before analyzing the data, we drop all rows with null values. To predict the happiness score, numerical values are required for all factors, so rows with missing data cannot be used.
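If you are curious how much data this removes, a quick optional check (not part of our original pipeline) is to count the missing values per column and the number of rows before dropping:
print(df.isnull().sum())  # number of missing values in each column
print(len(df))            # number of rows before dropping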
df = df.dropna()
df.head()
|   | Country | Year | Food Production Index | Obesity Female | Obesity Male | Life expectancy | Education Expenditure | Health Expenditure | Happiness Score |
|---|---|---|---|---|---|---|---|---|---|
| 5 | Australia | 2014 | 109.14 | 27.4 | 28.3 | 82.30000 | 5.16477 | 5637.566895 | 7.284 |
| 6 | Australia | 2015 | 109.72 | 27.9 | 28.9 | 82.40000 | 5.31127 | 4887.800781 | 7.313 |
| 7 | Australia | 2016 | 105.58 | 28.4 | 29.6 | 82.44878 | 5.27678 | 4999.810547 | 7.284 |
| 11 | Cambodia | 2014 | 176.12 | 4.3 | 2.4 | 68.27300 | 1.90939 | 73.299316 | 3.819 |
| 29 | Indonesia | 2014 | 140.39 | 8.1 | 4.1 | 70.48100 | 3.28801 | 108.837265 | 5.399 |
We first made a violin plot of the happiness scores from each year to get an idea of the basic distribution of the scores. As you can see below, the distribution remains more or less the same over the three years with no significant trends.
ax = sns.violinplot(x='Year', y='Happiness Score', data=df)
Next, we ran a linear regression model on all of our independent variables, with the Happiness Score as the dependent variable. The summary is displayed below to show the p-values for each factor, which we used to determine whether a given factor was statistically significant in our model.
x = df[['Food Production Index','Obesity Female', 'Obesity Male',
'Life expectancy', 'Education Expenditure', 'Health Expenditure']]
y = df[['Happiness Score']]
model = sm.OLS(y, x).fit()  # note: statsmodels OLS does not add an intercept by default
model.summary()
Dep. Variable: | Happiness Score | R-squared (uncentered): | 0.991 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.991 |
Method: | Least Squares | F-statistic: | 4544. |
Date: | Sat, 05 Dec 2020 | Prob (F-statistic): | 3.08e-255 |
Time: | 16:48:17 | Log-Likelihood: | -210.02 |
No. Observations: | 260 | AIC: | 432.0 |
Df Residuals: | 254 | BIC: | 453.4 |
Df Model: | 6 | | |
Covariance Type: | nonrobust | | |
|   | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| Food Production Index | 0.0011 | 0.001 | 0.801 | 0.424 | -0.002 | 0.004 |
| Obesity Female | 0.0180 | 0.008 | 2.154 | 0.032 | 0.002 | 0.034 |
| Obesity Male | 0.0274 | 0.010 | 2.815 | 0.005 | 0.008 | 0.047 |
| Life expectancy | 0.0526 | 0.003 | 15.258 | 0.000 | 0.046 | 0.059 |
| Education Expenditure | 0.1083 | 0.028 | 3.920 | 0.000 | 0.054 | 0.163 |
| Health Expenditure | 0.0002 | 2.42e-05 | 7.508 | 0.000 | 0.000 | 0.000 |
Omnibus: | 3.897 | Durbin-Watson: | 0.737 |
---|---|---|---|
Prob(Omnibus): | 0.143 | Jarque-Bera (JB): | 3.565 |
Skew: | -0.218 | Prob(JB): | 0.168 |
Kurtosis: | 2.627 | Cond. No. | 2.09e+03 |
As you can see, the majority of the resulting p-values are below 0.05, which is the threshold we decided to use for whether a factor was significant. The Food Production Index, however, has a p-value well above that threshold, so we will not include it in our model.
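If you would rather pull out the significant predictors programmatically instead of reading them off the table, a small sketch (not part of the original analysis) is to filter the fitted model's p-values:
significant = model.pvalues[model.pvalues < 0.05].index.tolist()  # predictors below our 0.05 threshold
print(significant)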
The last thing we need to do before creating our model is to check whether any of the predictors are related to each other. When predictors are correlated, the precision of the estimated regression coefficients decreases. To address this, we identify the correlations between the variables and add interaction terms to the model. If you are not familiar with interaction terms and would like more information, check out the previously hyperlinked article. To see if any of our predictors are related, we plot each pair of predictors against each other with a regression line and compute the Pearson r-squared value, which indicates how strongly the two variables are correlated. We wrote a short function, named 'interaction', to quickly do both of these things for each pair of predictors.
def interaction(x, y):
    print(stats.pearsonr(df[x], df[y])[0] ** 2)
    ax1 = sns.regplot(x=x, y=y, data=df)
interaction('Obesity Female', 'Obesity Male')
0.642773145901948
interaction('Obesity Female', 'Life expectancy')
0.22785701943344175
interaction('Obesity Female', 'Education Expenditure')
0.0755334498857785
interaction('Obesity Female', 'Health Expenditure')
0.029153557390565694
interaction('Life expectancy', 'Education Expenditure')
0.17668285382109106
interaction('Life expectancy', 'Health Expenditure')
0.44293844398926585
interaction('Education Expenditure', 'Health Expenditure')
0.1736263117050802
interaction('Obesity Male', 'Life expectancy')
0.6416543725734769
interaction('Obesity Male', 'Health Expenditure')
0.30353855358128756
interaction('Obesity Male', 'Education Expenditure')
0.14646904978684944
We decided that our threshold for whether two variables were correlated was an r-squared value of 0.5 or above, which was the case for female and male obesity, and for male obesity and life expectancy. We create interaction terms for these pairs by multiplying the related factors together and add them to our dataframe.
df['int_obesity'] = df['Obesity Male']*df['Obesity Female']
df['int_life'] = df['Obesity Male']*df['Life expectancy']
df.head()
|   | Country | Year | Food Production Index | Obesity Female | Obesity Male | Life expectancy | Education Expenditure | Health Expenditure | Happiness Score | int_obesity | int_life |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | Australia | 2014 | 109.14 | 27.4 | 28.3 | 82.30000 | 5.16477 | 5637.566895 | 7.284 | 775.42 | 2329.090000 |
| 6 | Australia | 2015 | 109.72 | 27.9 | 28.9 | 82.40000 | 5.31127 | 4887.800781 | 7.313 | 806.31 | 2381.360000 |
| 7 | Australia | 2016 | 105.58 | 28.4 | 29.6 | 82.44878 | 5.27678 | 4999.810547 | 7.284 | 840.64 | 2440.483903 |
| 11 | Cambodia | 2014 | 176.12 | 4.3 | 2.4 | 68.27300 | 1.90939 | 73.299316 | 3.819 | 10.32 | 163.855200 |
| 29 | Indonesia | 2014 | 140.39 | 8.1 | 4.1 | 70.48100 | 3.28801 | 108.837265 | 5.399 | 33.21 | 288.972100 |
First, we have to split our data into a train set and a test set. More information can be found here if you are curious about why we split the data into these two sets. We do this to avoid overfitting, where a model fits the training data too closely and fails to generalize to new data. A more detailed explanation of what overfitting is and why it is bad can be found here.
train, test = train_test_split(df, train_size=0.75, random_state=40)
xtrain = train[['Obesity Female', 'Obesity Male',
'Life expectancy', 'Education Expenditure', 'Health Expenditure','int_obesity']]
ytrain = train[['Happiness Score']]
Now we are ready to train our model using the Ordinary Least Squares method, commonly known as linear regression. We will look at the r-squared value of this model to gauge how good the resulting regression model is.
model = sm.OLS(ytrain, xtrain).fit()
model.rsquared
0.991370876357963
Our r-squared value is close to 1, which indicates that the model fits the training data very well.
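One caveat worth knowing: because we passed the predictors to sm.OLS without adding a constant, statsmodels fits the model without an intercept and reports the uncentered R-squared, which tends to run higher than the usual centered value. A minimal sketch of the variant with an intercept, for comparison only (not the model we use below):
xtrain_const = sm.add_constant(xtrain)  # adds a column of ones named 'const'
model_const = sm.OLS(ytrain, xtrain_const).fit()
print(model_const.rsquared)  # centered R-squared, typically lower than the uncentered value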
We will now use the model on the test data and compare the model's predictions with the actual happiness scores by plotting the predicted values against the actual ones. A good model will show a strong linear trend.
xtest = test[['Obesity Female', 'Obesity Male',
'Life expectancy', 'Education Expenditure', 'Health Expenditure','int_obesity']]
ytest = test[['Happiness Score']]
predicted = model.predict(xtest)
ax1 = sns.regplot(x=predicted, y=ytest, data=df)
As you can see, the scatter plot above shows a clear linear trend. While it is not perfect, it is accurate for the majority of the data, indicating that our model works well in predicting happiness scores.
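To put a number on how well the model does on the held-out data, one option (an extra check, not in the original analysis) is to compute the out-of-sample R-squared and root-mean-squared error from the test-set predictions:
import numpy as np
actual = ytest['Happiness Score']
residuals = actual - predicted
print('Test R-squared:', 1 - (residuals ** 2).sum() / ((actual - actual.mean()) ** 2).sum())
print('Test RMSE:', np.sqrt((residuals ** 2).mean()))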
To get an idea of what happiness looks like around the world, we use the folium library to create an interactive map that displays happiness scores as a color gradient across the globe. Higher happiness scores appear as a darker shade of pink, while lower scores appear as a lighter shade. The countries in white are those for which the dataset either did not have happiness scores or did not have enough information from the predictors to compute one.
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
country_shapes = f'{url}/world-countries.json'
m = folium.Map(location=[45.9432, 24.9668], zoom_start=4)
folium.Choropleth(
geo_data=country_shapes,
name='World Happiness',
data=df,
columns=['Country', 'Happiness Score'],
key_on='feature.properties.name',
fill_color='PuRd',
nan_fill_color='white'
).add_to(m)
m
Below, we use the same method as in the previous example to create a map of the scores predicted by our linear regression model, as opposed to the given happiness scores. As you can see, the results are not exactly the same, but they are similar.
df['predicted'] = model.predict(df[xtrain.columns])  # predict for every row using the same predictor columns the model was trained on
m = folium.Map(location=[45.9432, 24.9668], zoom_start=4)
folium.Choropleth(
geo_data=country_shapes,
name='World Happiness',
data=df,
columns=['Country', 'predicted'],
key_on='feature.properties.name',
fill_color='PuRd',
nan_fill_color='white'
).add_to(m)
m
According to the World Happiness Report in 2014, Switzerland was found to be the happiest country, followed closely by Iceland and Norway. The least happy countries were Togo, Burundi, and Benin. The same results were found in 2015 and 2016.
There are many studies that show what individual people can do to increase their happiness, or which factors affect individual happiness. Our tutorial looks at factors outside of our individual control that affect the overall happiness of our societies. The results we found show that life expectancy, obesity, health care, and education are certainly correlated with, and may have an impact on, one's happiness. Studies, like the one discussed in a Berkeley article titled 'Why Governments Should Care More About Happiness', have shown that societies benefit when their governments care about the happiness of their citizens. If countries allocated more time and resources to addressing the four previously mentioned factors, it is likely that people around the world would be happier!