Receive "TypeError: 'DistilBertTokenizer' object is not callable" when using KeyBERT on Colab

836 views Asked by At

Running KeyBERT to extract keywords on Google Colab with the following code gives an error:

from keybert import KeyBERT
model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words =None)
print(keywords)

But I get a TypeError: 'DistilBertTokenizer' object is not callable. I was checking another post on Stack Overflow, and I'm guessing that I probably shouldn't call extract_keywords directly. Any advice?

The full log is copied:

    TypeError                                 Traceback (most recent call last)
<ipython-input-18-f06d098e147a> in <module>()
----> 1 keywords = model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words =None)
      2 print(keywords)

5 frames
/usr/local/lib/python3.7/dist-packages/keybert/model.py in extract_keywords(self, docs, candidates, keyphrase_ngram_range, stop_words, top_n, min_df, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer)
    112                                                      diversity=diversity,
    113                                                      nr_candidates=nr_candidates,
--> 114                                                      vectorizer=vectorizer)
    115         elif isinstance(docs, list):
    116             warnings.warn("Although extracting keywords for multiple documents is faster "

/usr/local/lib/python3.7/dist-packages/keybert/model.py in _extract_keywords_single_doc(self, doc, candidates, keyphrase_ngram_range, stop_words, top_n, use_maxsum, use_mmr, diversity, nr_candidates, vectorizer)
    163 
    164             # Extract Embeddings
--> 165             doc_embedding = self.model.embed([doc])
    166             candidate_embeddings = self.model.embed(candidates)
    167 

/usr/local/lib/python3.7/dist-packages/keybert/backend/_sentencetransformers.py in embed(self, documents, verbose)
     51             that each have an embeddings size of `m`
     52         """
---> 53         embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
     54         return embeddings

/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in encode(self, sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)
    154         for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
    155             sentences_batch = sentences_sorted[start_index:start_index+batch_size]
--> 156             features = self.tokenize(sentences_batch)
    157             features = batch_to_device(features, device)
    158 

/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py in tokenize(self, texts)
    307         Tokenizes the texts
    308         """
--> 309         return self._first_module().tokenize(texts)
    310 
    311     def get_sentence_features(self, *features):

/usr/local/lib/python3.7/dist-packages/sentence_transformers/models/Transformer.py in tokenize(self, texts)
     98 
     99 
--> 100         output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length))
    101         return output
    102 

TypeError: 'DistilBertTokenizer' object is not callable

I tried using a different model (BertTokenizer) but the error persists. Any advice is appreciated.

0

There are 0 answers