`TfidfVectorizer` 允许传入自定义预处理器(preprocessor),您可以用它对文本进行任何需要的调整。
例如,要从示例语料库中删除所有连续出现的 “red” + “roses” 标记(不区分大小写),可以这样写:
import re
from sklearn.feature_extraction import text
# Example corpus: six short sentences, several of which contain the phrase
# "red roses" in different casings and punctuation contexts.
cases = [
    "I like red roses as much as I like blue tulips.",
    "It would be quite unusual to see red tulips, but not RED ROSES",
    "It is almost impossible to find blue roses",
    "I like most red flowers, but roses are my favorite.",
    "Could you buy me some red roses?",
    "John loves the color red. Roses are Mary's favorite flowers.",
]
# remove_stop_phrases() is our custom preprocessing function.
def remove_stop_phrases(doc: str) -> str:
    """Return *doc* with every stop phrase removed, ignoring case.

    Args:
        doc: The raw document text.

    Returns:
        The text with each case-insensitive match of a stop phrase replaced
        by the empty string. Surrounding whitespace is left untouched, so a
        removal in the middle of a sentence can leave two adjacent spaces.
    """
    # NOTE: this regex considers "... red. Roses..." as fair game for removal
    # (an optional space, an optional period and another optional space may
    # sit between the two words). If that's not what you want, just use
    # [r"red roses"] instead.
    # The pattern is a raw string: in a normal string literal "\s" is an
    # invalid escape sequence and triggers a warning on modern Python.
    stop_phrases = [r"red(\s?\.?\s?)roses"]
    for phrase in stop_phrases:
        doc = re.sub(phrase, "", doc, flags=re.IGNORECASE)
    return doc
# Built-in English stop-word set shipped with scikit-learn (a frozenset).
sw = text.ENGLISH_STOP_WORDS
mod_vectorizer = text.TfidfVectorizer(
    ngram_range=(2, 3),  # extract bigrams and trigrams only
    # Recent scikit-learn releases validate that `stop_words` is 'english',
    # a list, or None, so the frozenset is converted explicitly.
    stop_words=list(sw),
    norm='l2',
    min_df=1,
    # Define our custom preprocessor. Supplying one replaces the default
    # preprocessing stage, which is why capitalised tokens such as "It"
    # survive into the vocabulary.
    preprocessor=remove_stop_phrases,
)
# Dense document-term matrix of TF-IDF weights, one row per document.
dtm = mod_vectorizer.fit_transform(cases).toarray()
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() already returns a NumPy array of strings, so no
# extra np.array(...) wrapper (and no numpy import) is needed.
vocab = mod_vectorizer.get_feature_names_out()
现在 `vocab` 中所有对 “red roses” 的引用都已被删除。
# Print the vocabulary entries in sorted order.
print(sorted(vocab))
['Could buy',
'It impossible',
'It impossible blue',
'It quite',
'It quite unusual',
'John loves',
'John loves color',
'Mary favorite',
'Mary favorite flowers',
'blue roses',
'blue tulips',
'color Mary',
'color Mary favorite',
'favorite flowers',
'flowers roses',
'flowers roses favorite',
'impossible blue',
'impossible blue roses',
'like blue',
'like blue tulips',
'like like',
'like like blue',
'like red',
'like red flowers',
'loves color',
'loves color Mary',
'quite unusual',
'quite unusual red',
'red flowers',
'red flowers roses',
'red tulips',
'roses favorite',
'unusual red',
'unusual red tulips']
更新(根据评论区的讨论):
要将所需的停止短语与自定义停止词一起传递给包装函数,请使用:
# Regex stop phrases, removed case-insensitively from each document.
# Written as a raw string: in a normal string literal "\s" is an invalid
# escape sequence and triggers a warning on modern Python.
desired_stop_phrases = [r"red(\s?\.?\s?)roses"]
# Extra stop words to add on top of the built-in English list.
desired_stop_words = ['Could', 'buy']
def wrapper(stop_words, stop_phrases, docs=None):
    """Fit a TF-IDF vectorizer with custom stop words/phrases; return its vocabulary.

    Args:
        stop_words: Iterable of extra stop words, added to scikit-learn's
            built-in English stop-word set. Matching is case-sensitive
            because the custom preprocessor replaces the default
            preprocessing stage.
        stop_phrases: Iterable of regular-expression patterns. Every
            case-insensitive match is deleted from each document before
            tokenisation.
        docs: Optional iterable of documents to fit on. Defaults to the
            module-level ``cases`` corpus, which was the only corpus the
            original implementation supported.

    Returns:
        NumPy array of the bigram/trigram feature names learned from the
        corpus.
    """
    # Compile once up front: avoids re-processing the patterns for every
    # document and also makes one-shot iterables (generators) safe to pass.
    patterns = [re.compile(phrase, flags=re.IGNORECASE) for phrase in stop_phrases]

    def remove_stop_phrases(doc):
        """Strip every stop-phrase match from a single document."""
        for pattern in patterns:
            doc = pattern.sub("", doc)
        return doc

    # Recent scikit-learn releases require `stop_words` to be 'english',
    # a list, or None, so the combined frozenset is converted to a list.
    sw = list(text.ENGLISH_STOP_WORDS.union(stop_words))
    mod_vectorizer = text.TfidfVectorizer(
        ngram_range=(2, 3),
        stop_words=sw,
        norm='l2',
        min_df=1,
        preprocessor=remove_stop_phrases,
    )
    # Only the vocabulary is returned, so fitting is enough; there is no need
    # to materialise the dense document-term matrix.
    mod_vectorizer.fit(cases if docs is None else docs)
    # get_feature_names() was removed in scikit-learn 1.2; the replacement
    # already returns a NumPy array, so no np.array(...) wrapper is needed.
    return mod_vectorizer.get_feature_names_out()
# Build the vocabulary using the custom stop words and stop phrases; the
# returned array is shown when run interactively but discarded in a script.
wrapper(desired_stop_words, desired_stop_phrases)