-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.py
More file actions
68 lines (53 loc) · 2.13 KB
/
tokenizer.py
File metadata and controls
68 lines (53 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from abc import ABC, abstractmethod
import re
# import spacy
# import stanza
# import spacy_stanza
# from negspacy.negation import Negex
# from negspacy.termsets import termset
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
class Tokenizer(ABC):
    """Abstract interface for text tokenizers.

    Concrete subclasses (e.g. NltkTokenizer below) implement tokenize()
    to turn raw text into a list of normalized token strings.
    """

    @abstractmethod
    def tokenize(self, text: str) -> list:
        """Split *text* into a list of token strings."""
        pass
class NltkTokenizer(Tokenizer):
    """NLTK-backed tokenizer.

    Pipeline: strip punctuation and digits, word-tokenize, drop stopwords,
    lemmatize, and return lowercase tokens.
    """

    def __init__(self, language) -> None:
        """Build the language-specific NLTK resources.

        language: NLTK language name, e.g. "english" — must be valid for
        both the stopwords corpus and SnowballStemmer.
        """
        # NLTK stopword lists are lowercase, so tokens must be lowercased
        # before membership tests against this set (see tokenize()).
        self.stopwords = set(stopwords.words(language))
        # Kept for interface compatibility even though tokenize() currently
        # lemmatizes rather than stems.
        self.stemmer = SnowballStemmer(language=language)
        self.lemmatizer = WordNetLemmatizer()

    def tokenize(self, text: str) -> list:
        """Return lowercase, lemmatized tokens from *text*, with
        punctuation, digits, and stopwords removed."""
        # Replace punctuation (anything but word chars, whitespace, '~')
        # with spaces so adjacent words do not fuse together.
        text = re.sub(r"[^\~\w\s]", " ", text)
        text = self.remove_digits(text)
        # Tokenize the document text.
        words = word_tokenize(text)
        # BUG FIX: lowercase BEFORE the stopword test. The stopword set is
        # lowercase, so capitalized stopwords ("The", "And") previously
        # slipped through the filter.
        words = [word.lower() for word in words]
        words = [word for word in words if word not in self.stopwords]
        # Lemmatize (not stem) each remaining token.
        words = [self.lemmatizer.lemmatize(word) for word in words]
        return words

    def remove_digits(self, text):
        """Return *text* with every decimal digit removed."""
        return re.sub(r"\d", "", text)

    def clean_text(self, text):
        """Return *text* with special characters removed.

        ',', '.', ';', ':' become spaces; any other character that is not
        alphanumeric, whitespace, '~', '|' or '&' is dropped.
        """
        # BUG FIX: the original text.replace(",.;:", " ") matched the
        # literal 4-character string ",.;:", not each character. Translate
        # each punctuation character to a space individually.
        text = text.translate(str.maketrans(",.;:", "    "))
        # Regex pattern keeping word chars, whitespace, and ~ | &.
        regex = re.compile(r"[^\~\|\&a-zA-Z0-9\s]")
        return re.sub(regex, "", text)
# # Ignore this class for now
# class SpacyTokenizer(Tokenizer):
# def __init__(self) -> None:
# self.nlp_model = spacy_stanza.load_pipeline('en', download_method='REUSE_RESOURCES')
# def tokenize(self, text: str) -> list:
# tokenized_text = self.nlp_model(text)
# return [token.lemma_.lower() for token in tokenized_text if not token.lemma_.lower()=='\n' and not token.is_punct and not token.is_stop]