examples/named_entity_recognition/routine/text_preprocessor.py

28 lines
830 B
Python

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text
class TextPreprocessor:
    """Convert text instances into fixed-length sequences of token indices.

    ``fit`` builds a vocabulary with a Keras ``Tokenizer``; ``transform``
    maps instances to index sequences and pads them to
    ``max_sequence_length``.
    """

    def __init__(self, max_sequence_length: int) -> None:
        """Store the target sequence length; the vocabulary is built by ``fit``.

        Args:
            max_sequence_length: Length every transformed sequence is padded
                (or truncated) to.
        """
        self._max_sequence_length = max_sequence_length
        # Not read or written by any other method of this class.
        self._labels = None
        # Vocabulary size; set by ``fit``.
        self.number_words = None
        self._tokenizer = None

    def fit(self, instances) -> None:
        """Build the vocabulary from ``instances`` and record its size.

        Args:
            instances: Texts passed straight to ``Tokenizer.fit_on_texts``.
        """
        # lower=False keeps the original casing and the empty filter list
        # disables character stripping. With oov_token=None, tokens that were
        # not seen here get no index at transform time.
        tokenizer = text.Tokenizer(lower=False, filters=[], oov_token=None)
        tokenizer.fit_on_texts(instances)
        self._tokenizer = tokenizer
        self.number_words = len(tokenizer.word_index)
        print(self.number_words)

    def transform(self, instances):
        """Map ``instances`` to padded index sequences.

        Args:
            instances: Texts passed straight to
                ``Tokenizer.texts_to_sequences``.

        Returns:
            The result of ``pad_sequences``: one row per instance, padded at
            the end to ``max_sequence_length`` (longer sequences are
            truncated by ``pad_sequences``; its default drops leading items).

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self._tokenizer is None:
            raise RuntimeError(
                "TextPreprocessor.transform() called before fit()")

        sequences = self._tokenizer.texts_to_sequences(instances)
        # Pad to the length given at construction time instead of a
        # hard-coded 140, which silently ignored that argument.
        # NOTE(review): Tokenizer indices start at 1, so number_words - 1 is
        # also the index of a real token -- confirm this overlap with the
        # padding value is intended by the downstream model.
        padded_sequences = pad_sequences(
            sequences=sequences,
            maxlen=self._max_sequence_length,
            padding="post",
            value=self.number_words - 1)
        return padded_sequences