Personality Prediction Using Machine Learning
By Thejas and Anujith
Introduction
MBTI (the Myers-Briggs Type Indicator) is a questionnaire-based test that classifies personality along 4 scales into 16 types. Even though MBTI is criticized as pseudoscience, it can be applied broadly, for example to personalize ads or to recommend videos on an OTT platform.
The shortcomings of a traditional MBTI test are:
Plan
Dataset
Midway expectation
Work division
References
Predicting Myers-Briggs Type Indicator with Text Classification
POST MIDWAY
Recalling everything we have done till midway
For pre-processing
Work after Midway
Conclusion and shortcomings
Code
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the MBTI dataset; the code below reads its 'type' and 'posts' columns.
csv_path = r"C:\Users\DELL\Desktop\Oooof\mbti.csv"
data = pd.read_csv(csv_path)

# Preview the first two rows, splitting each 'posts' string on the '|||' separator.
[entry.split('|||') for entry in data['posts'].head(2).values]
[replacing a large number of posts with this line to save time]
# Distribution of the MBTI types: one bar per type, bar height = number of rows.
cnt_types = data['type'].value_counts()
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn deprecated positional data vectors
# and newer releases accept them as keyword arguments only, so the old
# positional call no longer works there. The keyword form works on both.
sns.barplot(x=cnt_types.index, y=cnt_types.values, alpha=1)
plt.ylabel('Number of Occurrences', fontsize=10)
plt.xlabel('Types', fontsize=10)
plt.show()
-
#Adding columns for the type Indicators
def get_types(row):
    """Split a row's four-letter MBTI 'type' into four 0/1 indicator values.

    Each letter position is compared against its two valid letters: the
    first one (I, N, T, J) is coded 1 and the second one (E, S, F, P) is
    coded 0. Any other letter prints a warning and is coded 0 as well.

    Returns a pandas Series indexed by 'IE', 'NS', 'TF', 'JP'.
    """
    mbti = row['type']
    # (output column, letter coded 1, letter coded 0, warning text), in letter order
    axes = (
        ('IE', 'I', 'E', 'I-E incorrect'),
        ('NS', 'N', 'S', 'N-S incorrect'),
        ('TF', 'T', 'F', 'T-F incorrect'),
        ('JP', 'J', 'P', 'J-P incorrect'),
    )
    flags = {}
    for position, (column, one_letter, zero_letter, warning) in enumerate(axes):
        letter = mbti[position]
        if letter == one_letter:
            flags[column] = 1
            continue
        if letter != zero_letter:
            print(warning)
        flags[column] = 0
    return pd.Series(flags)
# Append the IE / NS / TF / JP indicator columns computed by get_types.
data = data.join(data.apply(get_types, axis=1))

# Stacked bar per indicator: the lower segment counts rows coded 0 and the
# upper segment counts rows coded 1 (get_types codes I, N, T and J as 1).
indicator_columns = ('IE', 'NS', 'TF', 'JP')
N = len(indicator_columns)
# Count each column once and reuse the result for both segments.
indicator_counts = [data[column].value_counts() for column in indicator_columns]
but = tuple(counts[0] for counts in indicator_counts)
top = tuple(counts[1] for counts in indicator_counts)
ind = np.arange(N)
width = 0.9
p1 = plt.bar(ind, but, width)
p2 = plt.bar(ind, top, width, bottom=but)
plt.ylabel('Count')
plt.title('Distribution across type indicators')
plt.xticks(ind, ('I/E', 'N/S', 'T/F', 'J/P',))
plt.show()
#Binarize Type Indicator
# Binary code of every MBTI letter: within each pair the first letter
# (I, N, F, J) maps to 0 and the second (E, S, T, P) maps to 1.
b_Pers = {'I': 0, 'E': 1, 'N': 0, 'S': 1, 'F': 0, 'T': 1, 'J': 0, 'P': 1}
# Reverse lookup, one dict per letter position (I/E, N/S, F/T, J/P).
b_Pers_list = [{0: 'I', 1: 'E'}, {0: 'N', 1: 'S'}, {0: 'F', 1: 'T'}, {0: 'J', 1: 'P'}]


def translate_personality(personality):
    """Turn an MBTI string such as 'INFJ' into its list of 0/1 codes."""
    return list(map(b_Pers.__getitem__, personality))


def translate_back(personality):
    """Turn a sequence of 0/1 codes back into MBTI letters, position by position."""
    return "".join(
        b_Pers_list[position][code] for position, code in enumerate(personality)
    )
#Preprocessing
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
# The 16 MBTI type names in lower case. pre_process_data deletes these
# strings from the (already lower-cased) posts so the label does not leak
# into the features.
unique_type_list = [
    mbti_type.lower()
    for mbti_type in (
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
    )
]

# NLTK helpers shared by the preprocessing step.
# NOTE(review): `stemmer` is not used by the code shown here; kept in case other cells rely on it.
cachedStopWords = stopwords.words("english")
lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()
def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):
    """Clean the posts and binarize the type label of every row in *data*.

    For each row, the 'posts' text has URLs replaced by a space, every
    non-letter character replaced by a space, runs of spaces collapsed and
    the result lower-cased. Every remaining word is then lemmatized.
    Progress is printed for the first row, every 500th row and the last row.

    Args:
        data: DataFrame with a 'posts' column (text) and a 'type' column
            (an MBTI string accepted by translate_personality).
        remove_stop_words: when true, drop words found in cachedStopWords
            before lemmatizing.
        remove_mbti_profiles: when true, delete every occurrence of the
            strings in unique_type_list from the cleaned text.

    Returns:
        A ``(list_posts, list_personality)`` tuple: a NumPy array of cleaned
        post strings and a NumPy array of the per-row binary codes, both in
        row order.
    """
    # Compile the patterns once instead of re-resolving them for every row.
    url_pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    non_letter_pattern = re.compile("[^a-zA-Z]")
    space_run_pattern = re.compile(' +')
    # A set gives O(1) membership tests; the original scanned the stop-word
    # list once per token. An empty set means "keep every word".
    stop_words = set(cachedStopWords) if remove_stop_words else set()

    list_personality = []
    list_posts = []
    len_data = len(data)
    for i, (_, row) in enumerate(data.iterrows(), start=1):
        if i % 500 == 0 or i == 1 or i == len_data:
            print("%s of %s rows" % (i, len_data))

        temp = url_pattern.sub(' ', row['posts'])
        temp = non_letter_pattern.sub(" ", temp)
        temp = space_run_pattern.sub(' ', temp).lower()

        # split(' ') rather than split() is kept on purpose so the cleaned
        # strings stay identical to what the previous implementation produced.
        temp = " ".join(
            lemmatiser.lemmatize(word)
            for word in temp.split(' ')
            if word not in stop_words
        )

        if remove_mbti_profiles:
            # Sequential passes, in list order, exactly as before.
            for mbti_type in unique_type_list:
                temp = temp.replace(mbti_type, "")

        list_personality.append(translate_personality(row['type']))
        list_posts.append(temp)

    return np.array(list_posts), np.array(list_personality)
# Clean every post and binarize every label; entry i of list_posts and
# entry i of list_personality come from the same row of `data`.
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
1 of 8675 rows 500 of 8675 rows 1000 of 8675 rows 1500 of 8675 rows 2000 of 8675 rows 2500 of 8675 rows 3000 of 8675 rows 3500 of 8675 rows 4000 of 8675 rows 4500 of 8675 rows 5000 of 8675 rows 5500 of 8675 rows 6000 of 8675 rows 6500 of 8675 rows 7000 of 8675 rows 7500 of 8675 rows 8000 of 8675 rows 8500 of 8675 rows 8675 of 8675 rows
#Vectorizing with word counts followed by tf-idf, keeping only words that occur in 10-70% of the documents
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
# Word-count settings: at most 1500 features, restricted to words whose
# document frequency lies between min_df (0.1) and max_df (0.7).
vectorizer_options = dict(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)
cntizer = CountVectorizer(**vectorizer_options)

print("CountVectorizer...")
X_cnt = cntizer.fit_transform(list_posts)

# Re-weight the raw counts with tf-idf and densify the result.
tfizer = TfidfTransformer()
print("Tf-idf...")
tfidf_sparse = tfizer.fit_transform(X_cnt)
X_tfidf = tfidf_sparse.toarray()
CountVectorizer... Tf-idf...
# List the vocabulary kept by the count vectorizer as (column index, word) pairs.
# CountVectorizer.get_feature_names() was deprecated and later removed from
# scikit-learn; get_feature_names_out() replaces it. Fall back to the old
# method only on versions that predate the new one.
try:
    vocabulary = cntizer.get_feature_names_out()
except AttributeError:
    vocabulary = cntizer.get_feature_names()
feature_names = list(enumerate(vocabulary))
feature_names
[replacing a 790 word features with this line to save time]
# Human-readable name of each binary indicator, in the same order as the
# letter positions of an MBTI type.
type_indicators = [
    "IE: Introversion (I) – Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) – Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

for indicator in type_indicators:
    print(indicator)
IE: Introversion (I) – Extroversion (E) NS: Intuition (N) – Sensing (S) FT: Feeling (F) – Thinking (T) JP: Judging (J) – Perceiving (P)
#Finding optimal hyperparameters
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
X = X_tfidf

# Base XGBoost settings shared by every indicator. n_estimators and
# learning_rate are overridden by the grid below; max_depth and nthread are
# fixed for the whole search.
param = {
    'n_estimators': 200,
    'max_depth': 2,
    'nthread': 8,
    'learning_rate': 0.2,
}

# Candidate values tried for every indicator. A wider search
# (learning_rate 0.01-0.3, max_depth 2-4) was left disabled in the notebook.
param_grid = {
    'n_estimators': [200, 300],
    'learning_rate': [0.2, 0.3],
}

# One independent grid search per binary indicator (column l of list_personality).
for l, indicator in enumerate(type_indicators):
    print("%s ..." % (indicator))
    Y = list_personality[:, l]
    model = XGBClassifier(**param)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
    grid_result = grid_search.fit(X, Y)

    # summarize results
    print("* Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    # The loop variable must not be named `param`: that rebinding replaced the
    # base settings dict, so every indicator after the first was built from
    # the last grid candidate and lost max_depth and nthread.
    for mean, stdev, candidate in zip(means, stds, params):
        print("* %f (%f) with: %r" % (mean, stdev, candidate))
IE: Introversion (I) – Extroversion (E) ...
-
NS: Intuition (N) – Sensing (S) ...
-
* Best: -0.474715 using {'learning_rate': 0.2, 'n_estimators': 200} * -0.474715 (0.012865) with: {'learning_rate': 0.2, 'n_estimators': 200} * -0.527921 (0.017557) with: {'learning_rate': 0.2, 'n_estimators': 300} * -0.540049 (0.020257) with: {'learning_rate': 0.3, 'n_estimators': 200} * -0.594415 (0.024021) with: {'learning_rate': 0.3, 'n_estimators': 300} FT: Feeling (F) – Thinking (T) ...
-
* Best: -0.554554 using {'learning_rate': 0.2, 'n_estimators': 200} * -0.554554 (0.026379) with: {'learning_rate': 0.2, 'n_estimators': 200} * -0.575055 (0.031195) with: {'learning_rate': 0.2, 'n_estimators': 300} * -0.603686 (0.040521) with: {'learning_rate': 0.3, 'n_estimators': 200} * -0.637024 (0.045305) with: {'learning_rate': 0.3, 'n_estimators': 300} JP: Judging (J) – Perceiving (P) ...
-
* Best: -0.674219 using {'learning_rate': 0.2, 'n_estimators': 200} * -0.674219 (0.012776) with: {'learning_rate': 0.2, 'n_estimators': 200} * -0.703747 (0.018876) with: {'learning_rate': 0.2, 'n_estimators': 300} * -0.747239 (0.022701) with: {'learning_rate': 0.3, 'n_estimators': 200} * -0.801555 (0.029589) with: {'learning_rate': 0.3, 'n_estimators': 300}
#XGBoost model
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X = X_tfidf

# Fixed XGBoost settings used for every indicator.
param = dict(n_estimators=200, max_depth=3, nthread=8, learning_rate=0.1)

# The same hold-out fraction and split seed are used for every indicator.
seed = 7
test_size = 0.33

# Train and score one binary classifier per indicator (column l of list_personality).
for l, indicator in enumerate(type_indicators):
    print("%s ..." % (indicator))
    Y = list_personality[:, l]
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # Predict on the held-out rows and round each prediction.
    y_pred = model.predict(X_test)
    predictions = [round(label) for label in y_pred]

    # Report hold-out accuracy for this indicator.
    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (indicator, accuracy * 100.0))
IE: Introversion (I) – Extroversion (E) ... * IE: Introversion (I) – Extroversion (E) Accuracy: 78.76% NS: Intuition (N) – Sensing (S) ...
-
* NS: Intuition (N) – Sensing (S) Accuracy: 85.92% FT: Feeling (F) – Thinking (T) ...
-
* FT: Feeling (F) – Thinking (T) Accuracy: 73.00% JP: Judging (J) – Perceiving (P) ...
-
* JP: Judging (J) – Perceiving (P) Accuracy: 65.91%
from sklearn.metrics import accuracy_score
# y_test and y_pred are rebound on every pass of the training loop above, so
# at this point they hold the values of the last indicator (J/P) only.
accuracy_score(y_test, y_pred)
0.659098847362906
from sklearn.metrics import confusion_matrix
# Confusion matrix for the last indicator trained above (J/P): y_test and
# predictions keep the values from the final loop pass.
confusion_matrix(y_test, predictions)
array([[ 420, 699], [ 277, 1467]], dtype=int64)