com.aliyun.odps.datahub.DatahubReader Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.aliyun.odps.datahub;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.commons.proto.ProtobufRecordStreamReader;
import com.aliyun.odps.commons.proto.XstreamPack.XStreamPack;
import com.aliyun.odps.commons.transport.Connection;
import com.aliyun.odps.commons.transport.Response;
import com.aliyun.odps.commons.util.IOUtils;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.rest.RestClient;
public class DatahubReader {
private RestClient datahubServiceClient;
private TableSchema tableSchema;
private String path;
private MessageDigest messageDigest;
private Map params;
private Map headers;
private byte[] packMeta;
private String lastPackId;
private String nextPackId;
private PackType.ReadMode readMode;
private ProtobufRecordStreamReader protobufRecordStreamReader;
public DatahubReader(RestClient datahubServiceClient, TableSchema tableSchema, String path,
Map params, Map headers) {
this(datahubServiceClient, tableSchema, path, params, headers, PackType.FIRST_PACK_ID);
}
public DatahubReader(RestClient datahubServiceClient, TableSchema tableSchema, String path,
Map params, Map headers,
String packId) {
this.datahubServiceClient = datahubServiceClient;
this.tableSchema = tableSchema;
this.path = path;
this.params = params;
this.headers = headers;
this.packMeta = null;
try {
this.messageDigest = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e.getMessage());
}
this.protobufRecordStreamReader = null;
seek(packId, PackType.ReadMode.SEEK_CUR);
}
/**
* 从指定的shard读取一条记录
*
* @return {@link com.aliyun.odps.data.Record}
* @throws OdpsException, IOException
*/
public Record read() throws OdpsException, IOException {
Record r = null;
do {
if (protobufRecordStreamReader != null) {
try {
r = protobufRecordStreamReader.read();
} catch (IOException e) {
protobufRecordStreamReader = null;
this.nextPackId = this.lastPackId;
this.readMode = PackType.ReadMode.SEEK_CUR;
throw e;
}
}
} while (r == null && getPack("all"));
return r;
}
/**
* 跳过某条Record不进行数据读取
*/
public void skipPack() {
seek(this.lastPackId, PackType.ReadMode.SEEK_NEXT);
}
/**
* 获取当前从服务端读取的pack的packId
*
* @return String PackId
*/
public String getLastPackId() {
return this.lastPackId;
}
private void seek(String rpid, PackType.ReadMode mode) {
if ((rpid == null || rpid.equals("")) && !(mode.equals(PackType.ReadMode.SEEK_BEGIN)
|| mode.equals(PackType.ReadMode.SEEK_END))) {
throw new IllegalArgumentException("Invalid pack id.");
}
switch (mode) {
case SEEK_BEGIN:
nextPackId = PackType.FIRST_PACK_ID;
break;
case SEEK_END:
nextPackId = PackType.LAST_PACK_ID;
break;
case SEEK_CUR:
case SEEK_NEXT:
nextPackId = rpid.toString();
break;
default:
throw new IllegalArgumentException("Invalid pack read mode.");
}
//lastPackId = null;
readMode = mode;
protobufRecordStreamReader = null;
}
public SeekPackResult seek(long timeStamp) throws OdpsException, IOException {
HashMap params = new HashMap(this.params);
HashMap headers = new HashMap(this.headers);
try {
params.put(DatahubConstants.SEEK_TIME, Long.toString(timeStamp));
Connection conn = datahubServiceClient.connect(path, "GET", params, headers);
Response resp = conn.getResponse();
if (!resp.isOK()) {
DatahubException ex = new DatahubException(conn.getInputStream());
ex.setRequestId(resp.getHeader(DatahubHttpHeaders.HEADER_ODPS_REQUEST_ID));
throw ex;
} else {
String json = IOUtils.readStreamAsString(conn.getInputStream());
JSONObject tree = JSON.parseObject(json);
String node = tree.getString("PackId");
if (node != null) {
SeekPackResult startPack = new SeekPackResult(node);
return startPack;
} else {
throw new DatahubException("get pack id fail");
}
}
} catch (DatahubException e) {
throw e;
} catch (Exception e) {
throw new DatahubException(e.getMessage(), e);
}
}
private boolean getPack(String fetchMode) throws OdpsException, IOException {
this.protobufRecordStreamReader = null;
HashMap params = new HashMap(this.params);
HashMap headers = new HashMap(this.headers);
try {
String strMode;
if (this.readMode.equals(PackType.ReadMode.SEEK_NEXT)) {
strMode = DatahubConstants.ITER_MODE_AFTER_PACKID;
} else {
strMode = DatahubConstants.ITER_MODE_AT_PACKID;
}
params.put(DatahubConstants.PACK_ID, this.nextPackId);
params.put(DatahubConstants.ITERATE_MODE, strMode);
params.put(DatahubConstants.PACK_NUM, "1");
params.put(DatahubConstants.PACK_FETCHMODE, fetchMode);
//headers.put(Headers.CONTENT_MD5, generatorMD5(bytes));
Connection conn = datahubServiceClient.connect(path, "GET", params, headers);
Response resp = conn.getResponse();
if (!resp.isOK()) {
DatahubException ex = new DatahubException(conn.getInputStream());
ex.setRequestId(resp.getHeader(DatahubHttpHeaders.HEADER_ODPS_REQUEST_ID));
throw ex;
}
String num = resp.getHeader(DatahubHttpHeaders.HEADER_ODPS_PACK_NUM);
if (num.equals("0")) {
return false;
}
InputStream in = conn.getInputStream();
byte[] bytes = IOUtils.readFully(in);
XStreamPack pack = XStreamPack.parseFrom(bytes);
if (fetchMode.equals("all")) {
bytes = pack.getPackData().toByteArray();
this.protobufRecordStreamReader = new ProtobufRecordStreamReader(
tableSchema, new ByteArrayInputStream(bytes));
}
if (pack.hasPackMeta()) {
this.packMeta = pack.getPackMeta().toByteArray();
} else {
this.packMeta = "".getBytes();
}
String npid = resp.getHeader(DatahubHttpHeaders.HEADER_ODPS_NEXT_PACKID);
this.lastPackId = resp.getHeader(DatahubHttpHeaders.HEADER_ODPS_CURRENT_PACKID);
if (!npid.equals(PackType.LAST_PACK_ID)) {
this.nextPackId = npid;
this.readMode = PackType.ReadMode.SEEK_CUR;
} else {
this.nextPackId = lastPackId;
this.readMode = PackType.ReadMode.SEEK_NEXT;
}
return true;
} catch (DatahubException e) {
throw e;
} catch (Exception e) {
throw new DatahubException(e.getMessage(), e);
}
}
public byte[] readMeta() throws OdpsException, IOException {
if (getPack("meta")) {
return this.packMeta;
}
return null;
}
private String generatorMD5(byte[] bytes) {
byte[] digest = messageDigest.digest(bytes);
StringBuilder sb = new StringBuilder();
for (byte b : digest) {
sb.append(String.format("%02X", b));
}
return sb.toString();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy