import pandas as pd #pandas package 임포트
from sklearn.preprocessing import LabelEncoder #범주형 데이터의 실수화 함수 임포트
#from 모듈 import 이름

#sklearn 모듈
from sklearn.preprocessing import OneHotEncoder #더미변수 생성, 가변환 함수 임포트


# Build a one-column training frame: six rows alternating "남성" / "여성"
# under the column "성별".
x_train = pd.DataFrame({"성별": ["남성", "여성"] * 3})

# Preview the first three rows.
x_train.head(3)


# Print the frame summary (index range, columns, non-null counts, dtypes).
x_train.info()

#데이터 프레임의 정보 확인 : object 타입 '성별' 변수

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   성별      6 non-null      object
dtypes: object(1)
memory usage: 176.0+ bytes


x_train['성별'].value_counts() # Count how many rows fall into each category of the 성별 column

남성    3
여성    3
Name: 성별, dtype: int64


# Create the label encoder
encoing = LabelEncoder()
# `encoing` now holds an instance of the LabelEncoder class imported from sklearn.preprocessing.

# Fit on the x_train data so the encoder learns which labels exist
encoing.fit(x_train["성별"]) # learn the distinct categories of the 성별 column

LabelEncoder()

LabelEncoder()


x_train["성별_인코딩"] = encoing.transform(x_train["성별"])
# Add a "성별_인코딩" column to x_train holding the integer code that the
# fitted label encoder assigns to each value of the 성별 column.


x_train


print(list(encoing.classes_)) # labels learned by the encoder; a label's position in this list is its integer code
print(list(encoing.inverse_transform([1,0]))) # map integer codes back to the original string labels

['남성', '여성']
['여성', '남성']


# One-Hot Encoding: create dummy (indicator) variables

x_train.head(2) # check the data (the frame that now also carries the label-encoded column)


# 2) One-Hot Encoding

# Create the one-hot encoder with dense (NumPy array) output instead of a
# SciPy sparse matrix.  The keyword for this was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4,
# so try the current spelling first and fall back for older installations.
try:
    one_encoding = OneHotEncoder(sparse_output=False)
except TypeError:
    one_encoding = OneHotEncoder(sparse=False)

# Fit on the x_train data (fitting = learning the categories of each column).
# The double brackets keep the input two-dimensional (a one-column DataFrame).
one_encoding.fit(x_train[["성별"]])

# Transform the 성별 column into its one-hot (dummy-variable) representation.
one_encoding.transform(x_train[["성별"]])

# fit and transform can also be done in a single call:
#one_encoding.fit_transform(x_train[["성별"]])

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.]])


# Check the names of the generated one-hot features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.  Because the encoder was fitted
# on a DataFrame, the names are prefixed with the input column name rather
# than the generic "x0".
print(one_encoding.get_feature_names_out())

['x0_남성' 'x0_여성']

C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)


# Store the one-hot encoded values in a new DataFrame, x_train_one,
# naming the two resulting columns '성별0' and '성별1'
x_train_one = pd.DataFrame(one_encoding.transform(x_train[['성별']]),
columns= ['성별0','성별1'])


# Append the one-hot columns of x_train_one to x_train column-wise (axis=1)
x_train = pd.concat([x_train, x_train_one], axis = 1)


# Check the result

x_train.head(3)


from sklearn.feature_extraction.text import CountVectorizer
#문서 집합에서 단어 토큰을 생성하고 각 단어의 수를 세어 Bag Of Words 로 인코딩하는 함수


# Toy corpus: four short documents, each a string of space-separated words.
corpus = [
    "청년 인재 개발 양성 과정",
    "인공지능 청년 양성",
    "미래 인공지능 데이터 대한민국",
    "데이터 원유 기술사 청년 개발",
]


corpus

['청년 인재 개발 양성 과정', '인공지능 청년 양성', '미래 인공지능 데이터 대한민국', '데이터 원유 기술사 청년 개발']


type(corpus) # confirm that the corpus is a plain Python list

list


# Create the count vectorizer
count_vect = CountVectorizer()

# Fit on the corpus: tokenizes the documents and builds the vocabulary
count_vect.fit(corpus)

# Inspect the learned vocabulary (word -> column index)

count_vect.vocabulary_

{'청년': 10,
 '인재': 9,
 '개발': 0,
 '양성': 6,
 '과정': 1,
 '인공지능': 8,
 '미래': 5,
 '데이터': 4,
 '대한민국': 3,
 '원유': 7,
 '기술사': 2}


# Vectorize the corpus and convert the result to a dense array for inspection
# (one row per document, one column per vocabulary word)
count_vect.transform(corpus).toarray()

array([[1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1],
       [0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1]], dtype=int64)


# Vectorize the corpus, keeping the result as returned by transform()
# (a sparse matrix) for later use
features = count_vect.transform(corpus)


features

<4x11 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>


# 4) Convert to a document-term matrix

# Get just the feature (word) names, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so
# .tolist() keeps `vocab` a plain list as before.

vocab = count_vect.get_feature_names_out().tolist()

vocab

C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)

['개발', '과정', '기술사', '대한민국', '데이터', '미래', '양성', '원유', '인공지능', '인재', '청년']


# Convert the document-term matrix (DTM) into a DataFrame with one column per
# vocabulary word.  The full matrix is stored in DTM; .head() is applied only
# when displaying, so the stored frame is not silently cut to five documents
# when the corpus is larger.

DTM = pd.DataFrame(features.toarray(), columns = vocab)

DTM.head()


# Vectorize a new document against the fitted vocabulary to see which words match;
# words that are not in the vocabulary (here '만세') contribute nothing to the vector

count_vect.transform(['기술사 대한민국 인재 만세']).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)


# TF-IDF: a method that uses term frequency and inverse document frequency to
# give each word in the DTM a weight reflecting how important it is


# A low TF-IDF value means the word is less important; a high value means it is more important.

from sklearn.feature_extraction.text import TfidfVectorizer
# Mind the spelling: Tfid'f'Vectorizer (the class name contains an 'f')


corpus # show the same corpus again; it is reused for the TF-IDF example

['청년 인재 개발 양성 과정', '인공지능 청년 양성', '미래 인공지능 데이터 대한민국', '데이터 원유 기술사 청년 개발']


# Create the TF-IDF vectorizer
tfid = TfidfVectorizer()


# Fit on the corpus: builds the vocabulary and learns the IDF weights
tfid.fit(corpus)

TfidfVectorizer()

TfidfVectorizer()


# Inspect the learned vocabulary (word -> column index)
tfid.vocabulary_

{'청년': 10,
 '인재': 9,
 '개발': 0,
 '양성': 6,
 '과정': 1,
 '인공지능': 8,
 '미래': 5,
 '데이터': 4,
 '대한민국': 3,
 '원유': 7,
 '기술사': 2}


# Vectorize the corpus with TF-IDF weights and convert to a dense array for inspection
# (one row per document, one column per vocabulary word)
tfid.transform(corpus).toarray()

array([[0.41263976, 0.52338122, 0.        , 0.        , 0.        ,
        0.        , 0.41263976, 0.        , 0.        , 0.52338122,
        0.33406745],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.61366674, 0.        , 0.61366674, 0.        ,
        0.49681612],
       [0.        , 0.        , 0.        , 0.55528266, 0.43779123,
        0.55528266, 0.        , 0.        , 0.43779123, 0.        ,
        0.        ],
       [0.41263976, 0.        , 0.52338122, 0.        , 0.41263976,
        0.        , 0.        , 0.52338122, 0.        , 0.        ,
        0.33406745]])


# Vectorize the corpus with TF-IDF weights, keeping the result as returned by
# transform() (a sparse matrix) for later use
features_idf = tfid.transform(corpus)


features_idf

<4x11 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>


# 4) Convert to a document-term matrix

# Get just the feature (word) names, in column order, from the TF-IDF
# vectorizer itself (not from the separate count vectorizer), so the column
# labels always correspond to the matrix produced by `tfid`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so
# .tolist() keeps `vocab_idf` a plain list as before.

vocab_idf= tfid.get_feature_names_out().tolist()

C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)


vocab_idf

['개발', '과정', '기술사', '대한민국', '데이터', '미래', '양성', '원유', '인공지능', '인재', '청년']


# Convert the TF-IDF document-term matrix into a DataFrame with one column per
# vocabulary word.  The full matrix is stored in DTM_idf; .head() is applied
# only when displaying, so the stored frame is not silently cut to five
# documents when the corpus is larger.

DTM_idf = pd.DataFrame(features_idf.toarray(), columns = vocab_idf)


DTM_idf.head()

	개발	과정	기술사	대한민국	데이터	미래	양성	원유	인공지능	인재	청년
0	0.41264	0.523381	0.000000	0.000000	0.000000	0.000000	0.412640	0.000000	0.000000	0.523381	0.334067
1	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.613667	0.000000	0.613667	0.000000	0.496816
2	0.00000	0.000000	0.000000	0.555283	0.437791	0.555283	0.000000	0.000000	0.437791	0.000000	0.000000
3	0.41264	0.000000	0.523381	0.000000	0.412640	0.000000	0.000000	0.523381	0.000000	0.000000	0.334067

JETBRAINS(INTELLIJ , DATASPELL 등) 한국어 설치하는 방법 (0)	2023.01.08
★TensorFlow 설치★Keras 설치★ in Anaconda Prompt (0)	2022.11.28
anaconda 파이썬 업데이트시 생기는 오류 해결 (0)	2022.10.19
Could not fetch URL https://pypi.org/simple/missingno/ (0)	2022.09.13
쥬피터(아나콘다) 가상화 사용하기(anaconda Virtual environment) (0)	2022.07.25

뭐든지 다 알아보자

Menu

Category

Notice

Recent comments

Links

데이터정제_데이터실수화

'IT에대해 알아보자 > 쥬피터(ANACONDA)' 카테고리의 다른 글

+ Recent posts

티스토리툴바