Python Modularity2

2022-08-03

8주차 : Python Modularity

Understanding the Python Modularity Script
- Please download the new Titanic data from the Teams channel; there are two files, train.csv and test.csv
- Please find the python script for 7th week, week7-problem.py
- Open the file and make sure you understand each section/step
- Run the script on your local computer
- Get some insights!
Change the script !
- Previously, you learned about the Python main guard: if __name__ == '__main__'
- Your task is to change the script by adding a main function! (see the next page)
- Please save your work under a different filename

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

... 

# Load the Titanic training and test splits from the local dataset folder.
train_csv_path = "D:/FURANZU/CODE/dataset/train.csv"
test_csv_path = "D:/FURANZU/CODE/dataset/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Age categories.
def process_age(df, cut_points, label_names):
    """Fill missing ages and add an ``AgeCategory`` column to ``df``.

    Args:
        df: DataFrame with an ``Age`` column; it is modified in place.
        cut_points: Bin edges passed to ``pd.cut``.
        label_names: One label per bin (``len(cut_points) - 1`` entries).

    Returns:
        The same ``df`` object, so the result can be reassigned by the caller.
    """
    # -0.5 is a sentinel for "age unknown"; with the bin edges used by this
    # script it falls into the first (-1, 0] bin.
    df["Age"] = df["Age"].fillna(-0.5)
    # Use the ``label_names`` parameter; the undefined name ``label_name``
    # would raise NameError on the first call.
    df["AgeCategory"] = pd.cut(df["Age"], cut_points, labels=label_names)

    return df

# Bin edges for pd.cut and one label per resulting age interval.
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = [
    "Missing",
    "Infant",
    "Child",
    "Teenager",
    "YoungAdult",
    "Adult",
    "Senior",
]

# Apply the same age binning to both splits (train first, then test).
train, test = (process_age(frame, cut_points, label_names) for frame in (train, test))

...
...

# Persist the selected classifier. The file name is derived from its display
# name: lower-cased, spaces replaced by underscores, "_classifier.model" appended.
model_filename = f"{best_model[0].lower().replace(' ', '_')}_classifier.model"
# The context manager guarantees the file handle is flushed and closed, which a
# bare open(...) passed directly to pickle.dump does not.
with open(model_filename, "wb") as model_file:
    pickle.dump(best_model[1], model_file)

(no main function)

Change to ⬇

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

... 

# Load the Titanic training and test splits from the local dataset folder.
train_csv_path = "D:/FURANZU/CODE/dataset/train.csv"
test_csv_path = "D:/FURANZU/CODE/dataset/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Age categories.
def process_age(df, cut_points, label_names):
    """Fill missing ages and add an ``AgeCategory`` column to ``df``.

    Args:
        df: DataFrame with an ``Age`` column; it is modified in place.
        cut_points: Bin edges passed to ``pd.cut``.
        label_names: One label per bin (``len(cut_points) - 1`` entries).

    Returns:
        The same ``df`` object, so the result can be reassigned by the caller.
    """
    # -0.5 is a sentinel for "age unknown"; with the bin edges used by this
    # script it falls into the first (-1, 0] bin.
    df["Age"] = df["Age"].fillna(-0.5)
    # Use the ``label_names`` parameter; the undefined name ``label_name``
    # would raise NameError on the first call.
    df["AgeCategory"] = pd.cut(df["Age"], cut_points, labels=label_names)

    return df

# Bin edges for pd.cut and one label per resulting age interval.
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = [
    "Missing",
    "Infant",
    "Child",
    "Teenager",
    "YoungAdult",
    "Adult",
    "Senior",
]

# Apply the same age binning to both splits (train first, then test).
train, test = (process_age(frame, cut_points, label_names) for frame in (train, test))

...
...

# Persist the selected classifier. The file name is derived from its display
# name: lower-cased, spaces replaced by underscores, "_classifier.model" appended.
model_filename = f"{best_model[0].lower().replace(' ', '_')}_classifier.model"
# The context manager guarantees the file handle is flushed and closed, which a
# bare open(...) passed directly to pickle.dump does not.
with open(model_filename, "wb") as model_file:
    pickle.dump(best_model[1], model_file)

if __name__ == '__main__':
   (your code here !)

Task:

In the current script, you will NOT find an if __name__ == '__main__' guard. Please add a main function to the script (HINT: you may need to move some parts into the main function, so the structure of the code will change). Please refer to this page as your reference.

풀이

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 22 18:42:56 2022

@author: franz
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

class Model:
    """Titanic survival-prediction pipeline.

    Loads the train/test CSV files, derives features (age category, family
    size, title), one-hot encodes the categorical columns and scores a fixed
    set of scikit-learn classifiers with 10-fold cross-validation.

    The best result across all ``predict`` calls is tracked on the class
    itself (``Model.best_accuracy`` / ``Model.best_model``), so it is shared
    by every instance.
    """

    # Display names of the supported classifiers; each is a key of
    # ``self.classifiers``.
    classifier_type = [ "Logistic Regression", "KNN", "SVM", "Kernel SVM", "Gaussian Naive Bayes", "Decision Tree",
            "Random Forest", "Gradient Boost" ]

    # Highest mean cross-validation accuracy seen by any ``predict`` call.
    best_accuracy = 0
    # ``(name, classifier)`` for that accuracy; ``None`` until ``predict`` runs.
    best_model = None

    def __init__(self, train_path="./data/train.csv", test_path="./data/test.csv"):
        """Load both CSV files and set up the pipeline configuration.

        Args:
            train_path: Path of the training CSV.
            test_path: Path of the test (hold-out) CSV.
        """
        cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
        label_names = ["Missing", "Infant", "Child", "Teenager", "YoungAdult", "Adult", "Senior"]
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)

        self.train = self.process_age(train, cut_points, label_names)
        self.test = self.process_age(test, cut_points, label_names)
        # Order matters: titles_in_name returns the first substring hit, so
        # "Mrs" has to be tried before "Mr".
        self.title_list = ["Mrs", "Mr", "Master", "Miss", "Major", "Rev",
                        "Dr", "Ms", "Mlle","Col", "Capt", "Mme", "Countess",
                        "Don", "Jonkheer"]
        # Columns that encode() expands into indicator columns.
        self.categorical_features = ["AgeCategory", "Sex", "Embarked", "Pclass", "Title"]
        # Feature columns fed to every classifier (single source of truth for
        # prepare_set and predict).
        self.columns = ["Fare", "AgeCategory_Infant", "AgeCategory_Child", "AgeCategory_Teenager", "AgeCategory_YoungAdult", "AgeCategory_Adult", "AgeCategory_Senior", "Sex_female", "Sex_male", "Embarked_C", "Embarked_Q", "Embarked_S", "Pclass_1", "Pclass_2", "Pclass_3", "FamilySize", "Title_Mr", "Title_Mrs", "Title_Miss", "Title_Master"]
        self.classifiers = {
            "Logistic Regression": LogisticRegression(random_state = 0, solver="lbfgs", max_iter = 10000),
            "KNN": KNeighborsClassifier(n_neighbors = 5, metric = "minkowski", p = 2),
            "SVM": SVC(kernel = "linear", random_state = 0),
            "Kernel SVM": SVC(kernel = "rbf", random_state = 0),
            "Gaussian Naive Bayes": GaussianNB(),
            "Decision Tree": DecisionTreeClassifier(criterion = "entropy", random_state = 0),
            "Random Forest": RandomForestClassifier(criterion = "entropy", n_estimators = 100, random_state = 0),
            "Gradient Boost": GradientBoostingClassifier()
        }

    def process_age(self, df, cut_points, label_names):
        """Fill missing ages and add an ``AgeCategory`` column to ``df``.

        Args:
            df: DataFrame with an ``Age`` column; it is modified in place.
            cut_points: Bin edges passed to ``pd.cut``.
            label_names: One label per bin.

        Returns:
            The same ``df`` object.
        """
        # Missing ages become the sentinel -0.5, which the bin edges used in
        # __init__ place in the first (-1, 0] bin.
        df["Age"] = df["Age"].fillna(-0.5)

        # Bin the numeric age into the labelled categories.
        df["AgeCategory"] = pd.cut(df["Age"], cut_points, labels=label_names)

        return df

    def titles_in_name(self, name: str, titles: list):
        """Return the first entry of ``titles`` that occurs in ``name``.

        Matching is a plain substring test, so the order of ``titles``
        decides which title wins. Returns ``np.nan`` when none matches.
        """
        for title in titles:
            if title in name:
                return title
        return np.nan

    def categorize_titles(self, person):
        """Collapse a passenger's raw title into Mr / Mrs / Miss / other.

        Args:
            person: Mapping (e.g. a DataFrame row) with ``"Title"`` and
                ``"Sex"`` entries.

        Returns:
            ``"Mr"``, ``"Mrs"`` or ``"Miss"`` for the titles handled below,
            otherwise the title unchanged.
        """
        title = person["Title"]

        if title in ["Don", "Major", "Capt", "Jonkheer", "Rev", "Col"]:
            return "Mr"
        if title in ["Countess", "Mme"]:
            return "Mrs"
        if title in ["Mlle", "Ms"]:
            return "Miss"
        if title == "Dr":
            # "Dr" is gender-neutral, so fall back to the Sex column. Compare
            # case-insensitively: the feature columns this class expects are
            # Sex_female / Sex_male, i.e. the values are lower-case, and a
            # comparison against "Male" would send every doctor to "Mrs".
            if str(person["Sex"]).lower() == "male":
                return "Mr"
            return "Mrs"
        return title

    def add_encoded_columns(self, df, column):
        """Return ``df`` with indicator columns for ``column`` appended.

        The new columns come from ``pd.get_dummies`` with ``column`` as the
        prefix; the original column is kept.
        """
        dummies = pd.get_dummies(df[column], prefix = column)
        # axis=1 places the indicator columns next to the existing ones.
        df = pd.concat([df, dummies], axis = 1)

        return df

    #########################################################################################################

    def feature_engineering(self):
        """Add ``FamilySize`` (SibSp + Parch + 1) to both data sets."""
        self.train["FamilySize"] = self.train["SibSp"] + self.train["Parch"] + 1
        self.test["FamilySize"] = self.test["SibSp"] + self.test["Parch"] + 1

    def title(self):
        """Derive a normalized ``Title`` column from ``Name`` in both data sets."""
        # Step 1: raw title found in the passenger name.
        self.train["Title"] = self.train["Name"].map(lambda x: self.titles_in_name(x, self.title_list))
        self.test["Title"] = self.test["Name"].map(lambda x: self.titles_in_name(x, self.title_list))

        # Step 2: collapse rare titles into the common ones, row by row.
        self.train["Title"] = self.train.apply(self.categorize_titles, axis=1)
        self.test["Title"] = self.test.apply(self.categorize_titles, axis=1)

    def encode(self):
        """Append indicator columns for every categorical feature."""
        for feature in self.categorical_features:
            self.train = self.add_encoded_columns(self.train, feature)
            self.test = self.add_encoded_columns(self.test, feature)

    def fillna(self):
        """Replace missing ``Fare`` values with the mean fare of the same data set."""
        self.train["Fare"] = self.train["Fare"].fillna((self.train["Fare"].mean()))
        self.test["Fare"] = self.test["Fare"].fillna((self.test["Fare"].mean()))

    def export_csv(self, prediction, type = "Logistic Regression"):
        """Write predictions to ``./prediction/titanic_<type>.csv``.

        Args:
            prediction: Predicted ``Survived`` values, one per row of
                ``self.test``.
            type: Classifier name used in the file name. The parameter name
                shadows the builtin but is kept so keyword callers still work.
        """
        submission = pd.DataFrame({"PassengerId": self.test["PassengerId"],
                                   "Survived": prediction})

        # export prediction results to csv
        outname = f'titanic_{type}.csv'
        outdir = './prediction'
        filename = os.path.join(outdir, outname)

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(outdir, exist_ok=True)

        submission.to_csv(filename, index=False)

    def prepare_set(self):
        """Select the training feature matrix ``X_all`` and target ``y_all``."""
        self.X_all = self.train[self.columns]
        self.y_all = self.train["Survived"]

    def predict(self, type = "Logistic Regression"):
        """Cross-validate one classifier, fit it and predict the test set.

        Prints the mean/min/max of the 10-fold cross-validation scores and
        updates ``Model.best_accuracy`` / ``Model.best_model`` when this
        classifier has the highest mean accuracy so far. ``prepare_set``
        must have been called first.

        Args:
            type: Key of ``self.classifiers`` (name kept for keyword callers).

        Returns:
            The classifier's predictions for ``self.test``.

        Raises:
            KeyError: If ``type`` is not a known classifier name.
        """
        classifier = self.classifiers[type]

        print(f"\n--- {type} ---")
        scores = cross_val_score(classifier, self.X_all, self.y_all, cv = 10)
        accuracy = np.mean(scores)
        lowest = np.min(scores)
        highest = np.max(scores)

        if accuracy > Model.best_accuracy:
            Model.best_accuracy = accuracy
            # Same object that is fitted below, so the stored model ends up trained.
            Model.best_model = (type, classifier)

        print(f"\nAccuracy: {accuracy}\nMin: {lowest}\nMax: {highest}\n")
        print("Fitting on all data, predicting test data...\n")

        # train and predict
        classifier.fit(self.X_all, self.y_all)
        return classifier.predict(self.test[self.columns])

# Script entry point: the pipeline runs only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    pipeline = Model()

    # Preprocessing: every step has to finish before prepare_set selects the
    # feature columns, and title() must precede encode() because "Title" is
    # one of the encoded columns.
    for step in (
        pipeline.feature_engineering,
        pipeline.title,
        pipeline.encode,
        pipeline.fillna,
        pipeline.prepare_set,
    ):
        step()

    # Evaluate each classifier and write its predictions to its own CSV file.
    for name in Model.classifier_type:
        pipeline.export_csv(pipeline.predict(name), name)

    # Report the winner tracked on the class by predict().
    print('best_model : ', Model.best_model, sep='\n')
    print('best_accuracy : ', Model.best_accuracy, sep='\n')