从数据库读取数据并保存为CSV,然后转换为HDF5,便于后续快速处理数据
from sqlalchemy import create_engine
import cx_Oracle as cx
import pandas as pd
class Analyze:
    """Stream Oracle query results to CSV, then convert the CSV to HDF5
    for faster downstream processing.

    Uses server-side streaming (``stream_results=True``) plus chunked reads
    so tens of millions of rows never have to fit in memory at once.
    """

    # NOTE(review): the DB connection is created at class-definition (import)
    # time, so importing this module requires a reachable database — consider
    # moving this into __init__ or a lazy property.
    dsnStr = cx.makedsn("ip", "port", service_name="spvdb")
    engine = create_engine("oracle://username:password@%s" % dsnStr)
    conn = engine.connect().execution_options(stream_results=True)

    def save_csv(self, query, filename):
        """Run *query* and append its result to *filename* as CSV, in chunks.

        Bug fix: with ``mode="a"``, ``to_csv`` previously wrote the header row
        for every chunk, scattering duplicate header lines through the file.
        The header is now emitted only for the first chunk.
        """
        for i, chunk in enumerate(
            pd.read_sql(sql=query, con=self.conn, chunksize=50000)
        ):
            chunk.to_csv(
                filename,
                encoding="utf_8_sig",
                mode="a",
                index=False,
                header=(i == 0),  # header only once, not per chunk
            )
            del chunk  # release the chunk before reading the next one
            print("保存文件中...")
        print("保存CSV文件完成")

    def save_hdf5(self, csv_table, hdf_table):
        """Convert *csv_table* (CSV path) into *hdf_table* (HDF5 path).

        Each chunk is appended to the "df" key so the full dataset is kept —
        writing with ``put``/``to_hdf`` per chunk would retain only the last
        chunk (the pitfall the original author noted).
        """
        store = pd.HDFStore(hdf_table, mode="w")
        try:
            for chunk in pd.read_csv(csv_table, chunksize=50000):
                # Cast every column to str so mixed-dtype chunks append
                # cleanly into the same HDF5 table.
                store.append("df", chunk.astype(str))
                del chunk
                print("保存文件中...")
        finally:
            # Close the store even if a chunk fails, so the file isn't leaked.
            store.close()
        print("保存HDF5文件完成")

    def read(self, filename="realtime.hd5"):
        """Load an HDF5 file (default: ``realtime.hd5``) and print it.

        The filename is now a parameter with the old hard-coded value as
        default, so existing callers are unaffected.
        """
        realtime = pd.read_hdf(filename)
        print(realtime)
if __name__ == '__main__':
    # Script entry point: open the pre-built HDF5 file and print its contents.
    analyzer = Analyze()
    analyzer.read()
之前在网上找到的分块读取方法,实际使用时速度太慢(千万行数据),在此记录一下
def query_result(query):
    """Read the complete result of *query* into a single DataFrame.

    Fetches the result in 50 000-row chunks and concatenates them at the end.

    NOTE(review): every chunk is accumulated in memory before the concat, so
    on very large result sets (tens of millions of rows) this is slow and
    memory-hungry — kept here only as a record of the approach.
    """
    frames = []
    for frame in pd.read_sql(query, con=conn, chunksize=50000):
        frames.append(frame)
        del frame  # drop the loop's own reference; the list still holds it
    return pd.concat(frames, ignore_index=True)
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)