I created a Dask dataframe using read_sql_query method and when I try to compute (some operation like pivoting, require computing), it raises an SQL error stating that it did not find the column I initially set as index.
import pyodbc
import pandas as pd
from dask.dataframe import read_sql_query
import dask as dk
from sqlalchemy import sql
import urllib
from sqlalchemy import create_engine
params = urllib.parse.quote_plus(conn_string)
engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
url_str = f"mssql+pyodbc:///?odbc_connect=%s" % params
def _remove_leading_select_from_query(query):
if query.startswith("SELECT "):
return query.replace("SELECT ", "", 1)
else:
return query
def query_sql_to_dataframe(conn_string, query,index):
try:
conection = pyodbc.connect(conn_string)
sa_query = sql.select(sql.text(_remove_leading_select_from_query(query)))
df = read_sql_query(sa_query, url_str,index_col=index)
except pyodbc.Error as ex:
df = None
finally:
if conection :
conection .close()
return df
# FT_DOC_ELETRONICO
query = 'SELECT ROW_NUMBER() OVER(ORDER BY some_column ASC) AS row,* FROM dbo.my_table'
idx = "row"
doc_eletr = query_sql_to_dataframe(conn_string, query,index=idx)
All the data operations work fine but when I need to compute the dataframe, it raises an eror as follows:
ProgrammingError: (pyodbc.ProgrammingError) ('42S22', "[42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'row'. (207) (SQLExecDirectW); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'row'. (207); [42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Statements could not be prepared (8180)") [SQL: SELECT ROW_NUMBER() OVER(ORDER BY some_column ASC) AS row,* FROM dbo.my_table WHERE row >= ? AND row <= ?] [parameters: (1, 1181321)] (Background on this error at: https://sqlalche.me/e/20/f405)
I tried changing the column but every column I set as index, Dask raises the same error claiming, it did not find it. What could I be doing wrong?