<aside> 🔥
주어진 데이터로 최종 타깃(y)인 “weather label”을 예측하는 모델을 구축하고, test set에 대한 예측 결과를 CSV 파일로 저장한다.
</aside>
!pip install category_encoders
!pip install optuna
import csv
import os
import random
import re
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from category_encoders.target_encoder import TargetEncoder
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from optuna.samplers import TPESampler
from pandas.plotting import scatter_matrix
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.base import clone
from sklearn.metrics import make_scorer, accuracy_score, log_loss, mean_squared_error, f1_score
# StratifiedKFold lives in sklearn.model_selection, not sklearn.metrics.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

warnings.filterwarnings(action='ignore')
def seed_everything(seed: int) -> None:
    """Seed the random number generators used by this script.

    Seeds Python's ``random`` module and NumPy's global RNG, and stores the
    seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value applied to every generator.
    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment
    # only influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(42)  # fix the seed
# Read the train / test splits from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data_train.csv', './data_test.csv')
)

# Show the raw training frame, the row count of each split, and the
# column / dtype / non-null overview of the training frame.
print(train_data)
print("훈련 세트 크기:", train_data.shape[0])
print("테스트 세트 크기:", test_data.shape[0])
train_data.info()
훈련 세트 크기: 2556 테스트 세트 크기: 1096
# Check for missing values: print the per-column NaN count of each split.
# The headings start with a real newline ("\n") so each report is preceded by
# a blank line; the previous "\\n" printed a literal backslash-n instead.
print("\nTrain Data Missing Values:")
print(train_data.isnull().sum())
print("\nTest Data Missing Values:")
print(test_data.isnull().sum())
train data, test data 결측치 존재 확인.
# Column to predict; `drop_x` holds the same name as a list (label to remove).
target = 'weather label'
drop_x = [target]

# Cast every column of both splits to float.
test = test_data.astype(float)
# For the training split, additionally discard every row containing a NaN.
train_data = train_data.astype(float).dropna()

# Remove duplicate rows and report how many were dropped.
before_train_size = len(train_data)
train = train_data.drop_duplicates()
print(f'중복 데이터 제거 개수 : {before_train_size - len(train)}')
# Renumber the rows 0..n-1 after the removals.
train = train.reset_index(drop=True)
# Observed result (from the original notes): no duplicate rows,
# i.e. the printed removal count was 0.
# Independent variables: every column except 'weather label'.
feature_columns = [
    'temperature.avg.',
    'temperature.min.',
    'temperature.max.',
    'wind speed.max.',
    'wind direction.max.',
    'wind speed.avg.',
    'dew point.avg.',
    'humidity.avg.',
    'atmospheric pressure.avg.',
    'insolation time.sum.',
    'insolation.sum.',
    'cloud.avg.',
    'ground temperature.avg',
    '1.0m temperature.avg',
    '5.0m temperature.avg',
]
# Build the design matrix from `train` (NaN rows and duplicate rows removed)
# rather than `train_data`, which still contains any duplicate rows.
x = train[feature_columns]
# Dependent variable: weather label.
y = train['weather label']
# Add the intercept term: sm.OLS does not include a constant by default.
X = sm.add_constant(x)
# Fit the ordinary least squares model.
model = sm.OLS(y, X).fit()
# In-sample predictions on the same design matrix.
predictions = model.predict(X)
# Regression summary table (displayed when this is the last expression of a
# notebook cell).
model.summary()