既然你的问题带有 machine-learning 标签,我建议你使用 scikit-learn 的 CountVectorizer(即 sklearn.feature_extraction.text.CountVectorizer):
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Build a word-count (bag-of-words) table: one row per sentence, one column
# per vocabulary word.
corpus = [
    'He is a good person',
    'He is bad student',
    'He is hardworking',
]
df = pd.DataFrame(data=corpus, columns=['sentences'])

# A fixed vocabulary pins both which words are counted and the column order.
vocabulary = ['he', 'is', 'a', 'good', 'person', 'bad', 'student', 'hardworking']

# token_pattern is relaxed to r"(?u)\b\w+\b" so that one-letter words such as
# "a" are kept; the default pattern only matches tokens of 2+ characters.
# min_df and stop_words stay at their defaults: min_df is ignored when a
# vocabulary is supplied, stop_words=None already disables stop-word
# filtering, and recent scikit-learn releases reject min_df=0 and a frozenset
# stop_words during parameter validation.
vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(df['sentences'].values)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the column names in vocabulary order.
result = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())
print(result)
输出:
   he  is  a  good  person  bad  student  hardworking
0   1   1  1     1       1    0        0            0
1   1   1  0     0       0    1        1            0
2   1   1  0     0       0    0        0            1