Running KeyBERT to extract keywords on Google Colab with the following code gives an error:
from keybert import KeyBERT
model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words =None)
print(keywords)
But I get a TypeError: 'DistilBertTokenizer' object is not callable. I was checking another post on Stack Overflow, and I'm guessing that I probably shouldn't call extract_keywords directly. Any advice?
The full log is copied:
TypeError Traceback (most recent call last)
<ipython-input-18-f06d098e147a> in <module>()
----> 1 keywords = model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words =None)
2 print(keywords)
5 frames
/usr/local/lib/python3.7/dist-packages/keybert/model.py in extract_keywords(self, docs, candidates, keyphrase_ngram_range, stop_words, top_n, min_df, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer)
112 diversity=diversity,
113 nr_candidates=nr_candidates,
--> 114 vectorizer=vectorizer)
115 elif isinstance(docs, list):
116 warnings.warn("Although extracting keywords for multiple documents is faster "
/usr/local/lib/python3.7/dist-packages/keybert/model.py in _extract_keywords_single_doc(self, doc, candidates, keyphrase_ngram_range, stop_words, top_n, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer)
163
164 # Extract Embeddings
--> 165 doc_embedding = self.model.embed([doc])
166 candidate_embeddings = self.model.embed(candidates)
167
/usr/local/lib/python3.7/dist-packages/keybert/backend/_sentencetransformers.py in embed(self, documents, verbose)
51 that each have an embeddings size of `m`
52 """
---> 53 embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
54 return embeddings
/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in encode(self, sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)
154 for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
155 sentences_batch = sentences_sorted[start_index:start_index+batch_size]
--> 156 features = self.tokenize(sentences_batch)
157 features = batch_to_device(features, device)
158
/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in tokenize(self, texts)
307 Tokenizes the texts
308 """
--> 309 return self._first_module().tokenize(texts)
310
311 def get_sentence_features(self, *features):
/usr/local/lib/python3.7/dist-packages/sentence_transformers/models/Transformer.py in tokenize(self, texts)
98
99
--> 100 output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length))
101 return output
102
TypeError: 'DistilBertTokenizer' object is not callable
I tried using a different model (BertTokenizer) but the error persists. Any advice is appreciated.