import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the MBTI dataset from a local CSV (machine-specific absolute path).
data = pd.read_csv(
    r"C:\Users\DELL\Desktop\Oooof\mbti.csv"
)

# Preview: split the `posts` text of the first two rows on the '|||' delimiter.
[entry.split('|||') for entry in data['posts'].head(2).values]

[replacing a large number of posts with this line to save time]


#Distribution of the MBTI types
# Number of rows per MBTI type, most frequent first.
cnt_types = data['type'].value_counts()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.barplot(x=cnt_types.index, y=cnt_types.values, alpha=1)
plt.ylabel('Number of Occurrences', fontsize=10)
plt.xlabel('Types', fontsize=10)
plt.show()

-


#Adding columns for the type Indicators
def get_types(row):
    """Encode the four letters of ``row['type']`` as binary indicator columns.

    Each position of the type string is mapped to 1 for I / N / T / J and to
    0 for E / S / F / P. Any other letter prints a diagnostic message and is
    encoded as 0.

    Returns a ``pd.Series`` with the keys 'IE', 'NS', 'TF' and 'JP'.
    """
    mbti = row['type']

    # One entry per letter position:
    # (output column, letter encoded as 1, letter encoded as 0, message label)
    axes = (
        ('IE', 'I', 'E', 'I-E'),
        ('NS', 'N', 'S', 'N-S'),
        ('TF', 'T', 'F', 'T-F'),
        ('JP', 'J', 'P', 'J-P'),
    )

    flags = {}
    for position, (column, one_letter, zero_letter, label) in enumerate(axes):
        letter = mbti[position]
        if letter not in (one_letter, zero_letter):
            # Unknown letter: report it and fall through to the 0 encoding.
            print(label + ' incorrect')
        flags[column] = 1 if letter == one_letter else 0

    return pd.Series(flags)

# Append the IE / NS / TF / JP indicator columns computed row by row.
data = data.join(data.apply(get_types, axis=1))


# Stacked bar chart of the 0/1 counts for each binary indicator column.
N = 4
indicator_counts = [data[col].value_counts() for col in ('IE', 'NS', 'TF', 'JP')]
but = tuple(counts[0] for counts in indicator_counts)  # rows encoded as 0
top = tuple(counts[1] for counts in indicator_counts)  # rows encoded as 1

ind = np.arange(N)
width = 0.9

# Lower segment first, then the second segment stacked on top of it.
p1 = plt.bar(ind, but, width)
p2 = plt.bar(ind, top, width, bottom=but)

plt.ylabel('Count')
plt.title('Distribution accoss types indicators')
plt.xticks(ind, ('I/E',  'N/S', 'T/F', 'J/P',))

plt.show()


#Binarize Type Indicator
# Per-position decoding tables: bit -> letter for each of the four MBTI axes.
b_Pers_list = [{0:'I', 1:'E'}, {0:'N', 1:'S'}, {0:'F', 1:'T'}, {0:'J', 1:'P'}]
# Inverse lookup (letter -> bit), derived from the decoding tables above.
b_Pers = {
    letter: bit
    for axis in b_Pers_list
    for bit, letter in axis.items()
}


def translate_personality(personality):
    """Return the list of bits encoding each letter of an MBTI type string."""
    return list(map(b_Pers.__getitem__, personality))


def translate_back(personality):
    """Rebuild the MBTI type string from a sequence of per-axis bits."""
    return "".join(
        b_Pers_list[axis][bit] for axis, bit in enumerate(personality)
    )


#Preprocessing

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import word_tokenize

#removing MBTI types
# The sixteen MBTI type codes, lower-cased up front.
unique_type_list = [
    code.lower()
    for code in (
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
    )
]


#Lemmatize
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# English stop words, fetched once and reused by the preprocessing step.
cachedStopWords = stopwords.words("english")

def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):
    """Clean the ``posts`` text of every row and binarize its ``type``.

    For each row the posts are stripped of URLs, reduced to letters only,
    collapsed to single spaces, lower-cased and lemmatized word by word.

    Parameters:
        data: DataFrame with a ``posts`` and a ``type`` column.
        remove_stop_words: when true, words found in ``cachedStopWords`` are
            dropped before lemmatization.
        remove_mbti_profiles: when true, every entry of ``unique_type_list``
            is removed from the cleaned text. This is a plain substring
            replacement, so it also hits codes embedded in longer words.

    Returns:
        ``(list_posts, list_personality)`` as NumPy arrays: the cleaned text
        per row and the ``translate_personality`` encoding of each row's type.

    Progress is printed for the first row, every 500th row and the last row.
    """
    # Compile the patterns once instead of looking them up on every row.
    url_pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    non_letter_pattern = re.compile("[^a-zA-Z]")
    space_run_pattern = re.compile(' +')
    # Set membership is O(1); the stop-word list would be scanned per token.
    stop_words = set(cachedStopWords)

    list_personality = []
    list_posts = []
    len_data = len(data)

    for i, (_, row) in enumerate(data.iterrows(), start=1):
        if i % 500 == 0 or i == 1 or i == len_data:
            print("%s of %s rows" % (i, len_data))

        temp = url_pattern.sub(' ', row.posts)
        temp = non_letter_pattern.sub(" ", temp)
        temp = space_run_pattern.sub(' ', temp).lower()

        words = temp.split(' ')
        if remove_stop_words:
            words = [w for w in words if w not in stop_words]
        temp = " ".join(lemmatiser.lemmatize(w) for w in words)

        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t, "")

        list_personality.append(translate_personality(row.type))
        list_posts.append(temp)

    list_posts = np.array(list_posts)
    list_personality = np.array(list_personality)
    return list_posts, list_personality


# Build the cleaned post texts and the per-row binary label vectors.
# remove_mbti_profiles is left at its default (True), so the MBTI type codes
# are removed from the text as well.
list_posts, list_personality  = pre_process_data(data, remove_stop_words=True)

1 of 8675 rows
500 of 8675 rows
1000 of 8675 rows
1500 of 8675 rows
2000 of 8675 rows
2500 of 8675 rows
3000 of 8675 rows
3500 of 8675 rows
4000 of 8675 rows
4500 of 8675 rows
5000 of 8675 rows
5500 of 8675 rows
6000 of 8675 rows
6500 of 8675 rows
7000 of 8675 rows
7500 of 8675 rows
8000 of 8675 rows
8500 of 8675 rows
8675 of 8675 rows


#Vectorizing using count and tf-idf by reducing the frequency to 10-70%
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

# Word-level bag of words capped at 1500 features; min_df / max_df keep only
# terms whose document frequency lies between 10% and 70%.
cntizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    min_df=0.1,
    max_df=0.7,
    max_features=1500,
)
tfizer = TfidfTransformer()

print("CountVectorizer...")
X_cnt = cntizer.fit_transform(list_posts)

print("Tf-idf...")
# Re-weight the raw counts with TF-IDF and densify the result.
X_tfidf = tfizer.fit_transform(X_cnt).toarray()

CountVectorizer...
Tf-idf...


# (index, term) pairs for the fitted vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(cntizer, "get_feature_names_out"):
    feature_names = list(enumerate(cntizer.get_feature_names_out()))
else:
    feature_names = list(enumerate(cntizer.get_feature_names()))
feature_names

[replacing the 790 word features with this line to save time]


# Human-readable label for each of the four binary indicators, in the same
# order as the columns of the encoded personality vectors.
type_indicators = [
    "IE: Introversion (I) – Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) – Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

# One label per line.
print(*type_indicators, sep="\n")

IE: Introversion (I) – Extroversion (E)
NS: Intuition (N) – Sensing (S)
FT: Feeling (F) – Thinking (T)
JP: Judging (J) – Perceiving (P)


#FInding optimal hyperparameters 
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

X = X_tfidf

#setup parameters for xgboost
# Base settings shared by every indicator's search; the grid below overrides
# n_estimators and learning_rate for each candidate.
param = {
    'n_estimators': 200,
    'max_depth': 2,
    'nthread': 8,
    'learning_rate': 0.2,
}

# Candidate values explored per indicator. A wider grid (learning_rate from
# 0.01 to 0.3, max_depth 2-4) can be substituted here.
param_grid = {
    'n_estimators': [200, 300],
    'learning_rate': [0.2, 0.3],
}

# Same seeded, shuffled 10-fold stratified split for every indicator.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

for l, indicator in enumerate(type_indicators):
    print("%s ..." % indicator)

    # Column l of the label matrix is the binary target for this indicator.
    Y = list_personality[:, l]
    model = XGBClassifier(**param)

    grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
    grid_result = grid_search.fit(X, Y)

    # summarize results
    print("* Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    # The loop variable must not be called `param`: that would rebind the base
    # settings dict above, and every later indicator would build its model
    # from the last grid candidate (losing max_depth and nthread).
    for mean, stdev, candidate in zip(means, stds, params):
        print("* %f (%f) with: %r" % (mean, stdev, candidate))

IE: Introversion (I) – Extroversion (E) ...

-

NS: Intuition (N) – Sensing (S) ...

-

* Best: -0.474715 using {'learning_rate': 0.2, 'n_estimators': 200}
* -0.474715 (0.012865) with: {'learning_rate': 0.2, 'n_estimators': 200}
* -0.527921 (0.017557) with: {'learning_rate': 0.2, 'n_estimators': 300}
* -0.540049 (0.020257) with: {'learning_rate': 0.3, 'n_estimators': 200}
* -0.594415 (0.024021) with: {'learning_rate': 0.3, 'n_estimators': 300}
FT: Feeling (F) – Thinking (T) ...

-

* Best: -0.554554 using {'learning_rate': 0.2, 'n_estimators': 200}
* -0.554554 (0.026379) with: {'learning_rate': 0.2, 'n_estimators': 200}
* -0.575055 (0.031195) with: {'learning_rate': 0.2, 'n_estimators': 300}
* -0.603686 (0.040521) with: {'learning_rate': 0.3, 'n_estimators': 200}
* -0.637024 (0.045305) with: {'learning_rate': 0.3, 'n_estimators': 300}
JP: Judging (J) – Perceiving (P) ...

-

* Best: -0.674219 using {'learning_rate': 0.2, 'n_estimators': 200}
* -0.674219 (0.012776) with: {'learning_rate': 0.2, 'n_estimators': 200}
* -0.703747 (0.018876) with: {'learning_rate': 0.2, 'n_estimators': 300}
* -0.747239 (0.022701) with: {'learning_rate': 0.3, 'n_estimators': 200}
* -0.801555 (0.029589) with: {'learning_rate': 0.3, 'n_estimators': 300}


#XGBoost model
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = X_tfidf

# XGBoost settings used for every indicator's classifier.
param = {
    'n_estimators': 200,
    'max_depth': 3,
    'nthread': 8,
    'learning_rate': 0.1,
}

# Hold-out split settings, identical for every indicator.
seed = 7
test_size = 0.33

for l, indicator in enumerate(type_indicators):
    print("%s ..." % (indicator))

    # Column l of the label matrix is the binary target for this indicator.
    Y = list_personality[:, l]
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (indicator, accuracy * 100.0))

IE: Introversion (I) – Extroversion (E) ...

* IE: Introversion (I) – Extroversion (E) Accuracy: 78.76%
NS: Intuition (N) – Sensing (S) ...

-

* NS: Intuition (N) – Sensing (S) Accuracy: 85.92%
FT: Feeling (F) – Thinking (T) ...

-

* FT: Feeling (F) – Thinking (T) Accuracy: 73.00%
JP: Judging (J) – Perceiving (P) ...

-

* JP: Judging (J) – Perceiving (P) Accuracy: 65.91%


from sklearn.metrics import accuracy_score
# y_test / y_pred are whatever the training loop last assigned. When the
# notebook is run top to bottom that is the final indicator (JP), so this
# reports the JP accuracy only.
accuracy_score(y_test, y_pred)

0.659098847362906


from sklearn.metrics import confusion_matrix
# Like the accuracy above, this uses the y_test / predictions left over from
# the last loop iteration, i.e. the final indicator (JP) when the notebook is
# run top to bottom.
confusion_matrix(y_test, predictions)

array([[ 420,  699],
       [ 277, 1467]], dtype=int64)

MIDWAY

Brief Analysis of the References We Used

Improving Intelligent Personality Prediction using Myers-Briggs Type Indicator and Random Forest Classifier