I'm trying to scan a remote HBase table that has more than 1,000,000,000 rows, and then write the scanned rows out as CSV files in HDFS.
I've spent almost three weeks trying to solve this, with no luck.
This is how I scan the data and build the CSV files (full code below). The error traceback points into thriftpy's transport layer:

source: /host/anaconda3/lib/python3.6/site-packages/thriftpy/transport/socket.py
I have tried the compat protocol settings, increasing the network TCP memory buffers, increasing the timeout configuration, setting the scan batch size anywhere from 1 to 10,000, and so on.
It runs fine for about 30 minutes, then the error suddenly appears; only about 1 in 50 runs finishes without any error. I've tried to find the cause of the error, but I can't pin it down. Does anyone know how to solve it?
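For reference, this is roughly how those settings are passed in happybase. The values below are just examples of what I experimented with, not a known-good combination:

import happybase

conn = happybase.Connection(
    '13.xxx.xxx.xxx',
    port=9090,
    timeout=60000,         # Thrift socket timeout, in milliseconds
    compat='0.98',         # HBase compatibility mode
    transport='buffered',  # or 'framed', matching hbase.regionserver.thrift.framed
    protocol='binary',     # or 'compact', matching hbase.regionserver.thrift.compact
)
table = conn.table(b'TEMP_TABLE')

# batch_size controls how many rows each Thrift round trip fetches;
# I tried values from 1 up to 10000.
for row_key, data in table.scan(batch_size=1000):
    pass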
This is my code:
import sys
print("--sys.version--")
print(sys.version)

import subprocess
import time
import logging

import happybase

logging.basicConfig(level=logging.DEBUG)

csv_list = []

def conn_base():
    print('conn_base starts')

    # SET UP THE CONNECTION
    conn = happybase.Connection('13.xxx.xxx.xxx', port=9090)
    table = conn.table(b'TEMP_TABLE')

    # ITERATE OVER THE ROWS, ROTATING TO A NEW CSV FILE EVERY 500,000
    # RECORDS AND SLEEPING FOR 30 SECONDS EVERY 1,000,000 RECORDS
    global csv_list
    print("LET'S MAKE CSV FILES FROM HBASE")
    index = 0
    st = 0
    for row_key, data in table.scan():
        try:
            if st % 1000000 == 0:
                time.sleep(30)
                print("COUNT:", st)
            if st % 500000 == 0:
                print("CHANGE CSV FILE")
                index += 1
                csv_list.append('TEMP_TABLE' + str(index) + '_version.csv')
            st += 1
            with open('/home/host01/csv_dir/TEMP_TABLE/' + csv_list[index - 1], 'a') as f:
                tmp = [
                    data[b'CF1:XXXXX'].decode(),
                    data[b'CF1:YYYYY'].decode(),
                    data[b'CF1:DDDDD'].decode(),
                    data[b'CF1:SSSSS'].decode(),
                    data[b'CF1:GGGGG'].decode(),
                    data[b'CF1:HHHHH'].decode(),
                    data[b'CF1:QQQQQ'].decode(),
                    data[b'CF1:WWWWWW'].decode(),
                    data[b'CF1:EEEEE'].decode(),
                    data[b'CF1:RRRRR'].decode(),
                ]
                f.write(",".join(tmp) + '\n')
        except Exception:
            # Log the failure instead of silently swallowing it; a bare
            # `except: pass` hides the real cause of the error.
            logging.exception("failed to write row %r", row_key)

    # PUT THE CSV FILES TO HDFS, PAUSING BRIEFLY EVERY 50 FILES
    for st, name in enumerate(csv_list, start=1):
        cmd = "hdfs dfs -put /home/host01/csv_dir/TEMP_TABLE/" + name + " /user/hive/warehouse/TEMP_TABLE/"
        subprocess.call(cmd, shell=True)
        if st % 50 == 0:
            time.sleep(5)
    print("PUT ALL CSV FILES TO HDFS")

    conn.close()
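One workaround that is sometimes suggested for scanner timeouts over Thrift is to remember the last row key that was read and reopen the scan from there when the connection drops. This is only a sketch (the retry limit, timeout, and sleep are arbitrary placeholders), but a resumable loop like this would replace the plain table.scan() above:

import logging
import time

import happybase

def scan_with_resume(host, table_name, batch_size=1000, max_retries=10):
    """Yield (row_key, data) pairs, reopening the scan from the last
    row key seen whenever the Thrift connection breaks mid-scan."""
    last_key = None
    retries = 0
    while True:
        conn = happybase.Connection(host, port=9090, timeout=60000)
        table = conn.table(table_name)
        try:
            for row_key, data in table.scan(row_start=last_key,
                                            batch_size=batch_size):
                if row_key == last_key:
                    continue  # row_start is inclusive; skip the repeated row
                yield row_key, data
                last_key = row_key
                retries = 0
            return  # reached the end of the table without an error
        except Exception:
            retries += 1
            if retries > max_retries:
                raise
            logging.exception("scan broke at %r; reconnecting", last_key)
            time.sleep(10)
        finally:
            conn.close()

The CSV-writing body of the loop stays the same; only the iteration becomes restartable.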
First make sure the HBase Thrift server is up and running. You can start the Thrift server with the following command; if you want to specify a port number, use -p (the default port is 9090).
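Assuming the standard hbase launcher script is on your PATH:

hbase thrift start            # listens on the default port, 9090
hbase thrift start -p 9091    # listens on port 9091 instead

You can also run it in the background as a daemon with hbase-daemon.sh start thrift.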