-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.py
More file actions
68 lines (53 loc) · 2.13 KB
/
tokenizer.py
File metadata and controls
68 lines (53 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from abc import ABC, abstractmethod
import re
# import spacy
# import stanza
# import spacy_stanza
# from negspacy.negation import Negex
# from negspacy.termsets import termset
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
class Tokenizer(ABC):
    """Abstract interface for text tokenizers.

    Concrete subclasses (e.g. NltkTokenizer below) implement tokenize()
    to turn raw text into a list of normalized token strings.
    """

    @abstractmethod
    def tokenize(self, text: str) -> list:
        """Split *text* into a list of token strings."""
        pass
class NltkTokenizer(Tokenizer):
    """NLTK-backed tokenizer.

    Pipeline: strip punctuation and digits, word-tokenize, drop stopwords,
    lemmatize, and return lowercase tokens.
    """

    def __init__(self, language) -> None:
        """Build the language-specific NLTK resources.

        language: NLTK language name, e.g. "english" — must be valid for
        both the stopwords corpus and SnowballStemmer.
        """
        # NLTK stopword lists are lowercase, so tokens must be lowercased
        # before membership tests against this set (see tokenize()).
        self.stopwords = set(stopwords.words(language))
        # Kept for interface compatibility even though tokenize() currently
        # lemmatizes rather than stems.
        self.stemmer = SnowballStemmer(language=language)
        self.lemmatizer = WordNetLemmatizer()

    def tokenize(self, text: str) -> list:
        """Return lowercase, lemmatized tokens from *text*, with
        punctuation, digits, and stopwords removed."""
        # Replace punctuation (anything but word chars, whitespace, '~')
        # with spaces so adjacent words do not fuse together.
        text = re.sub(r"[^\~\w\s]", " ", text)
        text = self.remove_digits(text)
        # Tokenize the document text.
        words = word_tokenize(text)
        # BUG FIX: lowercase BEFORE the stopword test. The stopword set is
        # lowercase, so capitalized stopwords ("The", "And") previously
        # slipped through the filter.
        words = [word.lower() for word in words]
        words = [word for word in words if word not in self.stopwords]
        # Lemmatize (not stem) each remaining token.
        words = [self.lemmatizer.lemmatize(word) for word in words]
        return words

    def remove_digits(self, text):
        """Return *text* with every decimal digit removed."""
        return re.sub(r"\d", "", text)

    def clean_text(self, text):
        """Return *text* with special characters removed.

        ',', '.', ';', ':' become spaces; any other character that is not
        alphanumeric, whitespace, '~', '|' or '&' is dropped.
        """
        # BUG FIX: the original text.replace(",.;:", " ") matched the
        # literal 4-character string ",.;:", not each character. Translate
        # each punctuation character to a space individually.
        text = text.translate(str.maketrans(",.;:", "    "))
        # Regex pattern keeping word chars, whitespace, and ~ | &.
        regex = re.compile(r"[^\~\|\&a-zA-Z0-9\s]")
        return re.sub(regex, "", text)
# # Ignore this class for now
# class SpacyTokenizer(Tokenizer):
# def __init__(self) -> None:
# self.nlp_model = spacy_stanza.load_pipeline('en', download_method='REUSE_RESOURCES')
# def tokenize(self, text: str) -> list:
# tokenized_text = self.nlp_model(text)
# return [token.lemma_.lower() for token in tokenized_text if not token.lemma_.lower()=='\n' and not token.is_punct and not token.is_stop]