# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# read the raw Prosper loan export from the local data folder
loans = pd.read_csv(filepath_or_buffer='data/prosperLoanData.csv')

# (rows, columns) of the raw frame
loans.shape

(113937, 81)

# preview the first rows of the raw data
loans.head()

# work on an independent copy holding only the columns examined in this analysis
loan_df = loans.loc[:, [
    'Term',
    'BorrowerAPR',
    'LoanOriginalAmount',
    'ProsperRating (Alpha)',
    'ListingCategory (numeric)',
    'EmploymentStatus',
    'CreditScoreRangeLower',
    'CreditScoreRangeUpper',
    'DebtToIncomeRatio',
    'StatedMonthlyIncome',
    'MonthlyLoanPayment',
]].copy()
loan_df.head()

#summary statistics (count, mean, std, min, quartiles, max) for Borrower APR
loan_df.loc[:, 'BorrowerAPR'].describe()

count    113912.000000
mean          0.218828
std           0.080364
min           0.006530
25%           0.156290
50%           0.209760
75%           0.283810
max           0.512290
Name: BorrowerAPR, dtype: float64

# fixed-width bins of 0.00625 covering 0 up to the largest observed APR
apr_step = 0.00625
bins = np.arange(0, loan_df['BorrowerAPR'].max() + apr_step, apr_step)

# histogram of Borrower APR on a linear axis
plt.figure(figsize=(8, 5))
sns.histplot(x='BorrowerAPR', data=loan_df, bins=bins)
plt.xlabel('Borrower APR');

#fill na values with 0
# NOTE: loans with no reported score therefore land in the '<300' bucket below.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which can act on a temporary copy and leave loan_df unchanged.
score_cols = ['CreditScoreRangeLower', 'CreditScoreRangeUpper']
loan_df[score_cols] = loan_df[score_cols].fillna(0)

#midpoint of each loan's reported credit score range
score_midpoint = (loan_df['CreditScoreRangeLower'] + loan_df['CreditScoreRangeUpper']) / 2

#create bins for each credit range
# pd.cut needs exactly one more edge than labels. With right=False the
# intervals are [0, 300), [300, 580), [580, 670), [670, 740), [740, 800) and
# [800, inf), so each label lines up with its own interval. The last bin is
# open-ended so midpoints above 850 are kept rather than turned into NaN.
bins = [0, 300, 580, 670, 740, 800, np.inf]
credit_labels = ['<300', '300 - 579', '580 - 669', '670 - 739', '740 - 799', '800 - 850']

# use pd.cut() to bucket the midpoints; with labels given, the result is
# already an ordered categorical whose categories follow credit_labels
loan_df['CreditScoreRange'] = pd.cut(score_midpoint, bins=bins, labels=credit_labels,
                                     right=False, ordered=True)

# the raw bounds are no longer needed once the bucket column exists
loan_df = loan_df.drop(columns=score_cols)

# bar chart of the number of loans in each credit score bucket
plt.figure(figsize=(8, 5))
sns.countplot(x='CreditScoreRange', data=loan_df)
plt.xlabel('Credit Score Range')
# tilt the bucket labels slightly
plt.xticks(rotation=15);

# linear-scale histogram of monthly payments in $25-wide bins
payment_step = 25
bins = np.arange(0, loan_df['MonthlyLoanPayment'].max() + payment_step, payment_step)

plt.figure(figsize=(8, 5))
sns.histplot(x='MonthlyLoanPayment', data=loan_df, bins=bins)
plt.xlabel('Monthly Loan Payment ($)');

# the linear plot has a long tail, so re-plot on a log-scaled x axis;
# bin edges are evenly spaced in log10 units, starting at 10**2.4
log_binsize = 0.0125
max_log_payment = np.log10(loan_df['MonthlyLoanPayment'].max())
bins = 10 ** np.arange(2.4, max_log_payment + log_binsize, log_binsize)

plt.figure(figsize=(8, 5))
sns.histplot(x='MonthlyLoanPayment', data=loan_df, bins=bins)
plt.xscale('log')
# readable dollar labels in place of powers of ten
plt.xticks([500, 1e3, 2.5e3], [500, '1k', '2.5k'])
plt.xlabel('Monthly Loan Payment ($)');

# one-month-wide bins from 10 months up to the longest loan term
longest_term = loan_df['Term'].max()
bins = np.arange(10, longest_term + 1)

# histogram of loan terms
plt.figure(figsize=(8, 5))
sns.histplot(x='Term', data=loan_df, bins=bins)
plt.xlabel('Loan Term (in months)');

#one stacked panel per variable, each drawn with seaborn's default binning
fig, ax = plt.subplots(nrows=3, figsize=(8, 10))

panel_columns = ['LoanOriginalAmount', 'DebtToIncomeRatio', 'StatedMonthlyIncome']
for panel, column in zip(ax, panel_columns):
    sns.histplot(data=loan_df, x=column, ax=panel)

#replot debt to income ratio with log scale

#bin edges evenly spaced in log10 units, from 10**-2.4 up to the largest ratio
log_binsize = 0.05
max_log_ratio = np.log10(loan_df['DebtToIncomeRatio'].max())
bins = 10 ** np.arange(-2.4, max_log_ratio + log_binsize, log_binsize)

sns.histplot(x='DebtToIncomeRatio', data=loan_df, bins=bins)
plt.xscale('log')
#label each tick with its plain value rather than a power of ten
ratio_ticks = [0.01, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10]
plt.xticks(ratio_ticks, ratio_ticks)
plt.xlabel('Debt to Income Ratio');

#replot stated monthly income with log scale

#bin edges evenly spaced in log10 units, from 10**2 up to the largest income
log_binsize = 0.05
max_log_income = np.log10(loan_df['StatedMonthlyIncome'].max())
bins = 10 ** np.arange(2, max_log_income + log_binsize, log_binsize)

sns.histplot(x='StatedMonthlyIncome', data=loan_df, bins=bins)
plt.xscale('log')
#abbreviated dollar labels for the thousands
income_ticks = [100, 500, 1000, 5000, 10000, 50000, 100000]
income_tick_labels = [100, 500, '1k', '5k', '10k', '50k', '100k']
plt.xticks(income_ticks, income_tick_labels)
plt.xlabel('Stated Monthly Income ($)');

#fix datatypes of categorical variables
#first: ProsperRating -- ordered from lowest (HR) to highest (AA); 'N/A' sits
#below every real rating and holds the loans that have no rating
ratings = ['N/A', 'HR', 'E', 'D', 'C', 'B', 'A', 'AA']

#fill missing ratings and assign the result back to the column. Calling
#fillna(inplace=True) on a column selection can act on a temporary copy and
#leave loan_df unchanged.
loan_df['ProsperRating (Alpha)'] = pd.Categorical(
    loan_df['ProsperRating (Alpha)'].fillna('N/A'), categories=ratings, ordered=True)

#ListingCategory changed to string type
#replaced numeric values with what they represent for easier readability
loan_df['ListingCategory (numeric)'] = loan_df['ListingCategory (numeric)'].astype(str)
category_map = {'0':'Not Available', '1':'Debt Consolidation', 
                '2':'Home Improvement', '3':'Business', '4':'Personal Loan',
                '5':'Student Use', '6':'Auto', '7':'Other', '8':'Baby & Adoption',
                '9':'Boat', '10':'Cosmetic Procedure', '11':'Engagement Ring',
                '12':'Green Loans', '13':'Household Expenses', '14':'Large Purchases',
                '15':'Medical/Dental', '16':'Motorcycle', '17':'RV', '18':'Taxes',
                '19':'Vacation', '20':'Wedding Loans'}

loan_df['ListingCategory (numeric)'] = loan_df['ListingCategory (numeric)'].map(category_map)
#Renaming this column, as the listing categories are no longer numeric
loan_df = loan_df.rename(columns={'ListingCategory (numeric)': 'ListingCategory'})

#filling null values in EmploymentStatus to match the corresponding label
#the dataset's existing label is spelled 'Not available' (lowercase "a"), so
#the fill value uses that exact spelling to avoid creating a second category
loan_df['EmploymentStatus'] = loan_df['EmploymentStatus'].fillna('Not available')

#three stacked count plots, one per categorical variable
fig, ax = plt.subplots(nrows=3, figsize=(5, 15))
rating_ax, listing_ax, employment_ax = ax

#ratings as vertical bars
sns.countplot(x='ProsperRating (Alpha)', data=loan_df, ax=rating_ax)
#listing categories as horizontal bars, with smaller tick text for the long labels
sns.countplot(y='ListingCategory', data=loan_df, ax=listing_ax)
listing_ax.tick_params(axis='y', labelsize=8)
#employment status as horizontal bars
sns.countplot(y='EmploymentStatus', data=loan_df, ax=employment_ax);

#column-name lists reused by the plots that follow
numeric_vars = [
    'Term',
    'BorrowerAPR',
    'LoanOriginalAmount',
    'DebtToIncomeRatio',
    'StatedMonthlyIncome',
    'MonthlyLoanPayment',
]
categoric_vars = [
    'CreditScoreRange',
    'ProsperRating (Alpha)',
    'ListingCategory',
    'EmploymentStatus',
]

# pairwise correlations of the numeric variables, drawn as an annotated
# heatmap with a diverging palette centred on zero
numeric_corr = loan_df[numeric_vars].corr()
plt.figure(figsize=(5, 5))
sns.heatmap(numeric_corr, annot=True, fmt='.3f',
            cmap='coolwarm', center=0, vmin=-1);

# scatter-plot matrix: 20-bin histograms on the diagonal, scatters elsewhere
g = sns.PairGrid(data=loan_df, vars=numeric_vars).map_diag(plt.hist, bins=20)
g.map_offdiag(plt.scatter);

# plot BorrowerAPR against categorical features

# PairGrid creates its own figure, so no plt.figure() call is made first; a
# preceding plt.figure() only produces an extra, empty figure in the output.
# The size of each panel is controlled through `height`.
g = sns.PairGrid(loan_df, x_vars=categoric_vars, y_vars=['BorrowerAPR'], height=4)
g.map(sns.boxplot)
# category names are long, so turn the x tick labels vertical on every panel
for panel in g.axes.flat:
    panel.tick_params(axis='x', labelrotation=90)

# getting a closer look at BorrowerAPR vs credit score range and prosper rating
# (the first two entries of categoric_vars)

# PairGrid creates its own figure, so no plt.figure() call is made first; a
# preceding plt.figure() only produces an extra, empty figure in the output.
g = sns.PairGrid(loan_df, x_vars=categoric_vars[:2], y_vars=['BorrowerAPR'], height=5)
g.map(sns.violinplot)
# turn the x tick labels vertical on every panel
for panel in g.axes.flat:
    panel.tick_params(axis='x', labelrotation=90)

# faceted heatmaps (2-D histograms) of BorrowerAPR against LoanOriginalAmount,
# one panel per CreditScoreRange, wrapped three panels to a row
g = sns.FacetGrid(data=loan_df, col='CreditScoreRange', col_wrap=3)
g.map_dataframe(sns.histplot, x='BorrowerAPR', y='LoanOriginalAmount',
                bins=15, cmap='inferno_r');

# keep only loans that carry a real Prosper rating, so the 'N/A' placeholder
# does not show up in the rating plot
has_rating = loan_df['ProsperRating (Alpha)'] != 'N/A'
filtered_df = loan_df[has_rating].copy()
# rebuild the categorical without the 'N/A' level
filtered_df['ProsperRating (Alpha)'] = pd.Categorical(
    filtered_df['ProsperRating (Alpha)'],
    categories=['HR', 'E', 'D', 'C', 'B', 'A', 'AA'])

# APR vs loan amount in three stacked panels, each coloured by a different third feature
fig, ax = plt.subplots(3, 1, figsize=(8, 15))

# settings shared by all three scatterplots
scatter_style = dict(x='BorrowerAPR', y='LoanOriginalAmount', s=15,
                     edgecolor=None, palette='viridis_r', alpha=0.5)

# Credit Score Range
sns.scatterplot(data=loan_df, hue='CreditScoreRange', ax=ax[0], **scatter_style)
# Prosper Rating (rated loans only)
sns.scatterplot(data=filtered_df, hue='ProsperRating (Alpha)', ax=ax[1], **scatter_style)
# Debt to Income Ratio
sns.scatterplot(data=loan_df, hue='DebtToIncomeRatio', ax=ax[2], **scatter_style);

# write the cleaned frame to the data folder, leaving out the row index
loan_df.to_csv(path_or_buf='data/loanDataModified.csv', index=False)

	ListingKey	ListingNumber	ListingCreationDate	CreditGrade	Term	LoanStatus	ClosedDate	BorrowerAPR	BorrowerRate	LenderYield	...	LP_ServiceFees	PercentFunded	Investors
0	1021339766868145413AB3B	193129	2007-08-26 19:09:29.263000000	C	36	Completed	2009-08-14 00:00:00	0.16516	0.1580	0.1380	...	-133.18	1.0	258
1	10273602499503308B223C1	1209647	2014-02-27 08:28:07.900000000	NaN	36	Current	NaN	0.12016	0.0920	0.0820	...	0.00	1.0	1
2	0EE9337825851032864889A	81716	2007-01-05 15:00:47.090000000	HR	36	Completed	2009-12-17 00:00:00	0.28269	0.2750	0.2400	...	-24.20	1.0	41
3	0EF5356002482715299901A	658116	2012-10-22 11:02:35.010000000	NaN	36	Current	NaN	0.12528	0.0974	0.0874	...	-108.01	1.0	158
4	0F023589499656230C5E3E2	909464	2013-09-14 18:38:39.097000000	NaN	36	Current	NaN	0.24614	0.2085	0.1985	...	-60.27	1.0	20

	Term	BorrowerAPR	LoanOriginalAmount	ProsperRating (Alpha)	ListingCategory (numeric)	EmploymentStatus	CreditScoreRangeLower	CreditScoreRangeUpper	DebtToIncomeRatio	StatedMonthlyIncome	MonthlyLoanPayment
0	36	0.16516	9425	NaN	0	Self-employed	640.0	659.0	0.17	3083.333333	330.43
1	36	0.12016	10000	A	2	Employed	680.0	699.0	0.18	6125.000000	318.93
2	36	0.28269	3001	NaN	0	Not available	480.0	499.0	0.06	2083.333333	123.32
3	36	0.12528	10000	A	16	Employed	800.0	819.0	0.15	2875.000000	321.45
4	36	0.24614	15000	D	2	Employed	680.0	699.0	0.26	9583.333333	563.97

Part I - Prosper Loan Data Exploration

by April LaRosa

Table of Contents

Introduction

Preliminary Wrangling

What is the structure of your dataset?

What is/are the main feature(s) of interest in your dataset?

What features in the dataset do you think will help support your investigation into your feature(s) of interest?

Univariate Exploration

Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?

Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?

Bivariate Exploration

Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?

Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?

Multivariate Exploration

Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?

Were there any interesting or surprising interactions between features?

Conclusions