I am playing around with some self-paced learning for creating and configuring LLMs for personal use. In this scenario, I am trying to connect to AstraDB to store the headlines from a sample of news articles in a vector database.
The Python code is below:
# Connection settings and credentials used by the script below.
# NOTE(review): the <INSERT ...> placeholders are not valid Python as written;
# presumably each is meant to be replaced with a quoted string - confirm.

# Path to the secure connect bundle; passed to the driver as the
# 'secure_connect_bundle' entry of cloud_config.
ASTRA_DB_SECURE_BUNDLE_PATH = <INSERT PATH>.zip #This is in a zip file downloaded from AstraDB
# Not referenced anywhere in the code visible here.
ASTRA_DB_APPLICATION_TOKEN = <INSERT TOKEN>
# Client id and secret are handed to PlainTextAuthProvider, in that order.
ASTRA_DB_CLIENT_ID = <INSERT CLIENT_ID>
ASTRA_DB_CLIENT_SECRET = <INSERT CLIENT_SECRET>
# Keyspace given to the Cassandra vector store.
ASTRA_DB_KEYSPACE_NAME = <INSERT KEYSPACE NAME>
# Passed as openai_api_key to both OpenAI and OpenAIEmbeddings.
OPEN_API_KEY = <INSERT OPENAI KEY>
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from datasets import load_dataset
# Seconds the driver waits on the Astra metadata service while processing the
# secure connect bundle. When 'connect_timeout' is absent from cloud_config the
# driver falls back to a short default (5 seconds per the driver docs), which
# is what surfaces as "TimeoutError: The read operation timed out" followed by
# "Unable to connect to the metadata service" on a slow connection.
# Raise or lower this value to experiment with different wait times.
ASTRA_DB_CONNECT_TIMEOUT_SECONDS = 30

cloud_config = {
    'secure_connect_bundle': ASTRA_DB_SECURE_BUNDLE_PATH,
    'connect_timeout': ASTRA_DB_CONNECT_TIMEOUT_SECONDS,
}

# Authenticate with the client id / secret pair and open a session.
auth_provider = PlainTextAuthProvider(ASTRA_DB_CLIENT_ID, ASTRA_DB_CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
astraSession = cluster.connect()

# OpenAI handles: the LLM itself and the embedding model used for the store.
llm = OpenAI(openai_api_key=OPEN_API_KEY)
myEmbedding = OpenAIEmbeddings(openai_api_key=OPEN_API_KEY)

# Vector store bound to the session, keyspace and the "qa_mini_demo" table.
myCassandraVStore = Cassandra(
    embedding=myEmbedding,
    session=astraSession,
    keyspace=ASTRA_DB_KEYSPACE_NAME,
    table_name="qa_mini_demo",
)

print("loading data from huggingface")
myDataset = load_dataset("Biddls/Onion_News", split="train")
# Only the first 50 entries of the "text" column are stored.
headlines = myDataset["text"][:50]

print("\nGenerating embeddings and storing in AstraDB")
myCassandraVStore.add_texts(headlines)
print("Inserted %i headlines.\n" % len(headlines))
When I run the file, I receive the following error:
Traceback (most recent call last):
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\cassandra\datastax\cloud\__init__.py", line 138, in read_metadata_info
response = urlopen(url, context=config.ssl_context, timeout=timeout)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 216, in urlopen
return opener.open(url, data, timeout)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 519, in open
response = self._open(req, data)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 536, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 496, in _call_chain
result = func(*args)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1391, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1352, in do_open
r = h.getresponse()
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1375, in getresponse
response.begin()
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 318, in begin
version, status, reason = self._read_status()
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 279, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\socket.py", line 705, in readinto
return self._sock.recv_into(b)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 1274, in recv_into
return self.read(nbytes, buffer)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 1130, in read
return self._sslobj.read(len, buffer)
TimeoutError: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\PATH\", line 22, in <module>
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
File "cassandra\cluster.py", line 1132, in cassandra.cluster.Cluster.__init__
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\cassandra\datastax\cloud\__init__.py", line 92, in get_cloud_config
config = read_metadata_info(config, cloud_config)
File "C:\PATH\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\cassandra\datastax\cloud\__init__.py", line 141, in read_metadata_info
raise DriverException("Unable to connect to the metadata service at %s. "
cassandra.DriverException: Unable to connect to the metadata service at https://3b3b9a1d-bb70-4078-8d4f-5b0e69e5a4b3-us-east1.db.astra.datastax.com:29080/metadata. Check the cluster status in the cloud console.
I have double-, triple-, and quadruple-checked that the cluster is active. My guess is that the timeout is the real problem, perhaps because of a slow internet connection, but I do not know how to change the timeout so that I can test with longer values.
I would appreciate any insight here.
How about if you update it as follows?