Вы можете использовать string.punctuation со встроенным списком стоп-слов NLTK:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
words = tokenize(text)
wordsWOStopwords = removeStopWords(words)
def tokenize(text):
sents = sent_tokenize(text)
return [word_tokenize(sent) for sent in sents]
def removeStopWords(words):
customStopWords = set(stopwords.words('english')+list(punctuation))
return [word for word in words if word not in customStopWords]
Завершить полные блокировки NLTK list