com.aliyun.odps.task.SQLTask Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.aliyun.odps.task;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.aliyun.odps.Instance;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.Project;
import com.aliyun.odps.Quota;
import com.aliyun.odps.Survey;
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.Task;
import com.aliyun.odps.commons.util.EmptyIterator;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.ResultSet;
import com.aliyun.odps.rest.SimpleXmlUtils;
import com.aliyun.odps.simpleframework.xml.Element;
import com.aliyun.odps.simpleframework.xml.Root;
import com.aliyun.odps.simpleframework.xml.convert.Convert;
import com.aliyun.odps.tunnel.InstanceTunnel;
import com.aliyun.odps.tunnel.TunnelException;
import com.aliyun.odps.tunnel.io.TunnelRecordReader;
import com.aliyun.odps.utils.CSVRecordParser;
import com.aliyun.odps.utils.ColumnUtils;
import com.aliyun.odps.utils.OdpsConstants;
import com.aliyun.odps.utils.StringUtils;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import com.google.gson.JsonParser;
import com.google.gson.reflect.TypeToken;
/**
* SQLTask的定义
*
* @author [email protected]
*/
@Root(name = "SQL", strict = false)
public class SQLTask extends Task {
@Element(name = "Query", required = false)
@Convert(SimpleXmlUtils.EmptyStringConverter.class)
private String query;
private static Map defaultHints;
private static final String AnonymousSQLTaskName = "AnonymousSQLTask";
private static final String AnonymousLineageTask = "AnonymousLineageTask";
public String getQuery() {
return query;
}
public static void setDefaultHints(Map hints){
SQLTask.defaultHints = hints;
}
public static void removeDefaultHints(){
SQLTask.defaultHints = null;
}
/**
* 设置SQL查询语句
*
* @param query
* 需要执行的SQL查询
*/
public void setQuery(String query) {
this.query = query;
}
/**
* 解析 CSV 格式的数据字符串,转换为 record 列表
* @param csvResult 数据字符串
* @return 记录列表
*
* @throws OdpsException
*/
public static List parseCsvRecord(String csvResult) throws OdpsException {
return CSVRecordParser.parse(csvResult).getRecords();
}
/**
* Return 1W records at most.
*
* Columns of each record in result are ALL OdpsType.STIRNG, ignore their real type in odps
*
* Return value is valid only when SQL query is select, otherwise, result will be empty.
*
* This API is used when SQLTask Instance is created with specific task name.
*
*
* Example:
*
* {
* String taskName = "test_select_sql_task";
* Instance i = SQLTask.run(odps, odps.getDefaultProject(),
* "select * from test_select_sql_result;",
* taskName, null, null, 3);
* instance.waitForSuccess();
* List records = SQLTask.getResult(i, taskName);
* }
*
*
* @param instance
* @return
* @throws OdpsException
*
* @see {@link #getResultByInstanceTunnel(Instance, String, Long)}
*/
public static List getResult(Instance instance, String taskName) throws OdpsException {
Map results = instance.getTaskResults();
String selectResult = results.get(taskName);
if (selectResult != null) {
return parseCsvRecord(selectResult);
}
return null;
}
/**
* Return 1W records with completed schema info at most. No data size limit.
*
* Return value is valid only when SQL query is select, otherwise, result will be empty.
*
* This API is used when SQLTask Instance is created with specific task name.
*
*
* Example:
*
* {
* String taskName = "test_select_sql_task";
* Instance i = SQLTask.run(odps, odps.getDefaultProject(),
* "select * from test_select_sql_result;",
* taskName, null, null, 3);
* instance.waitForSuccess();
* List records = SQLTask.getResultByInstanceTunnel(i, taskName, 1000L);
* }
*
*
*
* 使用 instance tunnel 的方式获取 task 结果.
*
* 本接口与 {@link #getResult(Instance, String)} 略有不同:
* 1) 本接口返回的 record 带有完整 schema 信息;
* 2) 本接口返回的 record 结果集有 1W 的条数限制,但不再有 10M 的大小限制
*
* 注意 : 返回结果类型为 {@link List}, 数据量较大时会带来较多内存开销,
* 大数据量下载建议直接使用 {@see InstanceTunnel} 或 {@link #getResultSet(Instance, String)};
*
* @param instance
* instance 对象
* @param taskName
* task 名称
* @param limit
* 获取结果的数量
* @return record list
* @throws OdpsException,
* IOException
*/
@Survey
public static List getResultByInstanceTunnel(Instance instance, String taskName,
Long limit)
throws OdpsException, IOException {
return getResultByInstanceTunnel(instance, taskName, limit, true);
}
public static List getResultByInstanceTunnel(Instance instance, String taskName,
Long limit, boolean limitEnabled)
throws OdpsException, IOException {
checkTaskName(instance, taskName);
InstanceTunnel tunnel = new InstanceTunnel(instance.getOdps());
InstanceTunnel.DownloadSession session =
tunnel.createDownloadSession(instance.getProject(), instance.getId(), limitEnabled);
long recordCount = session.getRecordCount();
List records = new ArrayList();
if (recordCount == 0) {
return records;
}
if (limit != null && limit < recordCount) {
recordCount = limit;
}
TunnelRecordReader reader = session.openRecordReader(0, recordCount);
Record record;
while ((record = reader.read()) != null) {
records.add(record);
}
return records;
}
/**
* 使用 instance tunnel 的方式获取 Anonymous task 的结果
*
* @param instance
* instance 对象
* @param limit
* 获取结果的数量
* @return record list
* @throws OdpsException,
* IOException
*
* @see #getResultByInstanceTunnel(Instance, String, Long)
*/
@Survey
public static List getResultByInstanceTunnel(Instance instance, Long limit)
throws OdpsException, IOException {
return getResultByInstanceTunnel(instance, AnonymousSQLTaskName, limit);
}
/**
* 使用 instance tunnel 的方式获取 instance 结果
*
* 当数据量不大于 {@link #SQL_TASK_RESULT_LIMIT} 条时,成功;
* 若数据量量大于 {@link #SQL_TASK_RESULT_LIMIT} 条, 返回 SQL_TASK_RESULT_LIMIT 条结果;
*
* 可使用 {@link #getResultByInstanceTunnel(Instance, String, Long)} 接口调整 limit 参数, 来获取更多数据;
*
* @param instance
* instance 对象
* @param taskName
* task 名称
* @return record list
* @throws OdpsException,
* IOException
*/
@Survey
public static List getResultByInstanceTunnel(Instance instance, String taskName)
throws OdpsException, IOException {
return getResultByInstanceTunnel(instance, taskName, null);
}
/**
* 使用 instance tunnel 的方式获取 Anonymous task 的结果
*
* @param instance
* instance 对象
*
* @return record list
* @throws OdpsException,
* IOException
*
* @see #getResultByInstanceTunnel(Instance, String)
*/
@Survey
public static List getResultByInstanceTunnel(Instance instance)
throws OdpsException, IOException {
return getResultByInstanceTunnel(instance, AnonymousSQLTaskName);
}
/**
* Return 1W records at most.
*
* Columns of each record in result are ALL OdpsType.STIRNG, ignore their real type in odps
* Return value is valid only when SQL query is select, otherwise, result will be empty.
* Default task name 'AnonymousSQLTask' will be used.
*
*
* Example:
*
* {
* Instance i = SQLTask.run(odps, "select * from test_select_sql_result;");
* instance.waitForSuccess();
* List records = SQLTask.getResult(i);
* }
*
*
* @param instance
* @return
* @throws OdpsException
*/
public static List getResult(Instance instance) throws OdpsException {
return getResult(instance, AnonymousSQLTaskName);
}
/**
* 通过instance获取记录迭代器,从而可以让用户通过迭代器逐条获取记录来避免一次性获取全量数据到本地时撑爆内存的问题
*
* 注:本接口使用 instance tunnel 实现,没有记录数限制,可获取instance对应query结果集的全量数据。
* 但是只有instance的owner本人可以使用本接口,且当对应project打开protection时,
* 需要提前在policy中为对应SQL中涉及的相应表和视图添加exception,否则无权下载
*
* @param instance
* @return
* @throws OdpsException
*/
public static ResultSet getResultSet(Instance instance) throws OdpsException, IOException {
return getResultSet(instance, AnonymousSQLTaskName);
}
/**
* 通过instance获取记录迭代器,从而可以让用户通过迭代器逐条获取记录来避免一次性获取全量数据到本地时撑爆内存的问题
*
* 注:本接口使用 instance tunnel 实现,没有记录数限制,可获取instance对应query结果集的全量数据。
* 但是只有instance的owner本人可以使用本接口,且当对应project打开protection时,
* 需要提前在policy中为对应SQL中涉及的相应表和视图添加exception,否则无权下载
*
* @param instance
* @param taskName
* @return
* @throws OdpsException
*/
public static ResultSet getResultSet(Instance instance, String taskName)
throws OdpsException, IOException {
return getResultSet(instance, taskName, null);
}
/**
* 通过instance获取记录迭代器,从而可以让用户通过迭代器逐条获取记录来避免一次性获取全量数据到本地时撑爆内存的问题
*
* 注:本接口使用 instance tunnel 实现,没有记录数限制,可获取instance对应query结果集的全量数据。
* 但是只有instance的owner本人可以使用本接口,且当对应project打开protection时,
* 需要提前在policy中为对应SQL中涉及的相应表和视图添加exception,否则无权下载
*
* @param instance
* @param limit
* @return
* @throws OdpsException
*/
public static ResultSet getResultSet(Instance instance, Long limit) throws OdpsException, IOException {
return getResultSet(instance, AnonymousSQLTaskName, limit);
}
/**
* 通过instance获取记录迭代器,从而可以让用户通过迭代器逐条获取记录来避免一次性获取全量数据到本地时撑爆内存的问题
*
* 注:本接口使用 instance tunnel 实现,没有记录数限制,可获取instance对应query结果集的全量数据。
* 但是只有instance的owner本人可以使用本接口,且当对应project打开protection时,
* 需要提前在policy中为对应SQL中涉及的相应表和视图添加exception,否则无权下载
*
* @param instance
* @param taskName
* @param limit
* @return
* @throws OdpsException
*/
public static ResultSet getResultSet(Instance instance, String taskName, Long limit)
throws OdpsException {
return getResultSet(instance, taskName, limit, false);
}
/**
* 通过instance获取记录迭代器,从而可以让用户通过迭代器逐条获取记录来避免一次性获取全量数据到本地时撑爆内存的问题
*
* 注:
* 1.只有instance的owner本人可以使用本接口
* 2.当limitHint为true时,结果最多只能获得1条记录,超过将截断,但无需进行逐表的权限检查
* 3.当limitHint为false时,没有记录数限制,可获取instance对应query结果集的全量数据。但前提是需要逐表(SQL中
* 涉及的表与视图)对用户进行权限检查,所以当查询涉及表所在project打开protection时,需要提前在policy中为相应表
* 和视图添加exception,否则无权下载
*
* @param instance
* @param taskName
* @param limit
* @param limitHint
* @return
* @throws OdpsException
*/
public static ResultSet getResultSet(Instance instance, String taskName, Long limit, boolean limitHint)
throws OdpsException {
return getResultSet(instance, taskName, limit, limitHint, null);
}
/**
* 通过instance获取记录迭代器,从而可以让用户通过迭代器逐条获取记录来避免一次性获取全量数据到本地时撑爆内存的问题
*
* 注:本接口使用 instance tunnel 实现,没有记录数限制,可获取instance对应query结果集的全量数据。
*
* 1.只有instance的owner本人可以使用本接口
* 2.当limitHint为true时,结果最多只能获得1条记录,超过将截断,但无需进行逐表的权限检查
* 3.当limitHint为false时,没有记录数限制,可获取instance对应query结果集的全量数据。但前提是需要逐表(SQL中
* 涉及的表与视图)对用户进行权限检查,所以当查询涉及表所在project打开protection时,需要提前在policy中为相应表
* 和视图添加exception,否则无权下载
*
* @param instance
* @param taskName
* @param limit
* @param limitHint
* @param tunnelEndpoint 指定 tunnel endpoint
* @return
* @throws OdpsException
*/
public static ResultSet getResultSet(Instance instance, String taskName, Long limit,
boolean limitHint, URI tunnelEndpoint)
throws OdpsException {
checkTaskName(instance, taskName);
InstanceTunnel tunnel = new InstanceTunnel(instance.getOdps());
if (tunnelEndpoint != null) {
tunnel.setEndpoint(tunnelEndpoint.toString());
}
InstanceTunnel.DownloadSession session =
tunnel.createDownloadSession(instance.getProject(), instance.getId(), limitHint);
long recordCount = session.getRecordCount();
if (recordCount == 0) {
return new ResultSet(EmptyIterator.emptyIterator(), session.getSchema(), recordCount);
}
if (limit != null && limit < recordCount) {
recordCount = limit;
}
return new ResultSet(new RecordSetIterator(session, recordCount), session.getSchema(), recordCount);
}
public static ResultSet getResultSet(Instance instance, String taskName,
InstanceTunnel instanceTunnel, Long limit, boolean limitHint)
throws OdpsException {
checkTaskName(instance, taskName);
InstanceTunnel.DownloadSession session =
instanceTunnel.createDownloadSession(instance.getProject(), instance.getId(), limitHint);
long recordCount = session.getRecordCount();
if (recordCount == 0) {
return new ResultSet(EmptyIterator.emptyIterator(), session.getSchema(), recordCount);
}
if (limit != null && limit < recordCount) {
recordCount = limit;
}
return new ResultSet(new RecordSetIterator(session, recordCount), session.getSchema(),
recordCount);
}
private static void checkTaskName(Instance instance, String taskName) throws OdpsException {
if (StringUtils.isNullOrEmpty(taskName)) {
throw new OdpsException("Invalid task name.");
}
boolean findTask = false;
for (String n : instance.getTaskNames()) {
if (taskName.equals(n)) {
findTask = true;
break;
}
}
if (!findTask) {
throw new OdpsException("Invalid task: " + taskName);
}
}
@Override
public String getCommandText() {
return query;
}
/**
* 获取运行 SQL 的编译警告信息,可能有多条.
*
* @param instance
* instance 对象
* @return 编译警告信息
* @throws OdpsException
*/
public static List getSqlWarning(Instance instance) throws OdpsException {
return getSqlWarning(instance, AnonymousSQLTaskName);
}
/**
* 获取运行 SQL 的编译警告信息,可能有多条.
*
* @param instance
* instance 对象
* @param taskName
* task 名称
* @return 编译警告信息
* @throws OdpsException
*/
public static List getSqlWarning(Instance instance, String taskName) throws OdpsException {
String warnings = instance.getTaskInfo(taskName, "warnings");
try {
List warningList = new LinkedList();
JsonObject jsonObject = new JsonParser().parse(warnings).getAsJsonObject();
if (!jsonObject.has("warnings")) {
return null;
}
JsonArray array = jsonObject.get("warnings").getAsJsonArray();
for (JsonElement element : array) {
warningList.add(element.getAsString());
}
return warningList;
} catch (JsonParseException e) {
return null;
}
}
/**
* 获取运行 SQL 运行结果的 schema 信息。
*
* @param instance
* {@link Instance} 对象
*
* @return 运行结果的 schema
*
*/
public static TableSchema getResultSchema(Instance instance) throws OdpsException {
// Create a LineageTask with the same context of the original SQLTask
Gson gson = new GsonBuilder().disableHtmlEscaping().create();
SQLTask sqlTask;
if (instance.getTasks() == null
|| instance.getTasks().isEmpty()
|| !(instance.getTasks().get(0) instanceof SQLTask)) {
throw new IllegalArgumentException("Not a SQLTask instance");
}
sqlTask = (SQLTask) instance.getTasks().get(0);
Map taskProperties = sqlTask.getProperties();
Map settings = new HashMap<>();
if (taskProperties.containsKey("settings")) {
settings.putAll(
gson.fromJson(
taskProperties.get("settings"), new TypeToken
© 2015 - 2025 Weber Informatics LLC | Privacy Policy