[AI]/python.sklearn
sklearn.feature_extraction.text.CountVectorizer.ngram_range 적용
givemebro
2020. 4. 28. 15:23
반응형
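n-gram : 연속된 n개의 단어(토큰)를 하나로 묶은 단위
ex) s='I am Tam'
2-gram(bigram) : [I am], [am Tam]
(참고: CountVectorizer의 기본 token_pattern은 두 글자 이상인 토큰만 사용하고 소문자로 변환하므로, 실제로는 'I'가 제외되어 bigram은 [am tam] 하나만 만들어진다.)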
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary of unigrams and bigrams (ngram_range=(1, 2)) from the
# training texts and encode the texts as a term-count matrix.
vect = CountVectorizer(ngram_range=(1, 2))
X_train = vect.fit_transform(text_train)
# Number of distinct features (unigrams + bigrams). len(vocabulary_) is used
# instead of len(get_feature_names()): that method was deprecated in
# scikit-learn 1.0 and removed in 1.2, while vocabulary_ exists in every
# version and has exactly one entry per feature.
len(vect.vocabulary_)
1522634
# Quick look at vocabulary entries that consist of two words (bigrams)
#1 iterate over (term, index) pairs with items()
# count=0
# for key,value in vect.vocabulary_.items():
#     # value=vect.vocabulary_[key]
#     if len(key.split())>1:
#         print(key,value)
#         count+=1
#     if count>=100:break
#2 iterate over keys and look each index up explicitly
# count=0
# for key in vect.vocabulary_:
#     value=vect.vocabulary_[key]
#     if len(key.split())>1:
#         print(key,value)
#         count+=1
#     if count>=100:break
#3 same as #1, but also show the iteration position via enumerate()
# count=0
# for i,(key,value) in enumerate(vect.vocabulary_.items()):
#     # value=vect.vocabulary_[key]
#     if len(key.split())>1:
#         print(i,key,value) # i: position in the dict's iteration order
#         count+=1
#     if count>=100:break

# Apply TF-IDF weighting with ngram_range=(1, 2) and evaluate
# LogisticRegression on the resulting features with cross-validation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# min_df=5 ignores terms that occur in fewer than 5 documents.
vect = TfidfVectorizer(min_df=5, ngram_range=(1, 2))
X_train = vect.fit_transform(text_train)
# NOTE: the vectorizer is fitted on all of text_train before the data is
# split into folds, so each validation fold has already contributed to the
# vocabulary and IDF weights. Wrapping the vectorizer and classifier in a
# Pipeline would avoid this; it is left as-is here to keep the published
# scores reproducible.
# cv=3 is passed explicitly: the recorded result has three fold scores, but
# the default number of folds changed from 3 to 5 in scikit-learn 0.22.
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=3)
scores
array([0.88540917, 0.89020878, 0.88754201])
반응형