public class SQLContext extends Object implements Logging, scala.Serializable
:: AlphaComponent ::
The entry point for running relational queries using Spark. Allows the creation of SchemaRDD objects and the execution of SQL queries.
Modifier and Type | Class and Description
---|---
static class | org.apache.spark.sql.SQLConf.Deprecated$
Constructor and Description
---
SQLContext(SparkContext sparkContext)
Modifier and Type | Method and Description
---|---
SchemaRDD | applySchema(RDD<org.apache.spark.sql.catalyst.expressions.Row> rowRDD, org.apache.spark.sql.catalyst.types.StructType schema)<br>:: DeveloperApi :: Creates a SchemaRDD from an RDD containing Rows by applying a schema to this RDD.
String | AUTO_BROADCASTJOIN_THRESHOLD()
int | autoBroadcastJoinThreshold()<br>Upper bound on the sizes (in bytes) of the tables qualified for auto conversion to a broadcast value during the physical execution of join operations.
void | cacheTable(String tableName)<br>Caches the specified table in-memory.
void | clear()
String | CODEGEN_ENABLED()
boolean | codegenEnabled()<br>When set to true, Spark SQL will use the Scala compiler at runtime to generate custom bytecode that evaluates expressions found in queries.
String | COLUMN_BATCH_SIZE()
int | columnBatchSize()<br>The number of rows that will be grouped into a single batch for in-memory columnar caching.
String | COMPRESS_CACHED()
<A extends scala.Product> SchemaRDD | createParquetFile(String path, boolean allowExisting, org.apache.hadoop.conf.Configuration conf, scala.reflect.api.TypeTags.TypeTag<A> evidence$2)<br>:: Experimental :: Creates an empty Parquet file with the schema of class A, which can be registered as a table.
<A extends scala.Product> SchemaRDD | createSchemaRDD(RDD<A> rdd, scala.reflect.api.TypeTags.TypeTag<A> evidence$1)<br>Creates a SchemaRDD from an RDD of case classes.
String | DEFAULT_SIZE_IN_BYTES()
long | defaultSizeInBytes()<br>The default size in bytes to assign to a logical operator's estimation statistics.
String | dialect()<br>The SQL dialect that is used when parsing queries.
String | DIALECT()
scala.collection.immutable.Map<String,String> | getAllConfs()<br>Return all the configuration properties that have been set (i.e., not the defaults).
String | getConf(String key)<br>Return the value of the Spark SQL configuration property for the given key.
String | getConf(String key, String defaultValue)<br>Return the value of the Spark SQL configuration property for the given key, or defaultValue if the key is not set.
boolean | isCached(String tableName)<br>Returns true if the table is currently cached in-memory.
boolean | isParquetBinaryAsString()<br>When set to true, we always treat byte arrays in Parquet files as strings.
SchemaRDD | jsonFile(String path)<br>Loads a JSON file (one object per line), returning the result as a SchemaRDD.
SchemaRDD | jsonFile(String path, double samplingRatio)<br>:: Experimental ::
SchemaRDD | jsonFile(String path, org.apache.spark.sql.catalyst.types.StructType schema)<br>:: Experimental :: Loads a JSON file (one object per line) and applies the given schema, returning the result as a SchemaRDD.
SchemaRDD | jsonRDD(RDD<String> json)<br>Loads an RDD[String] storing JSON objects (one object per record), returning the result as a SchemaRDD.
SchemaRDD | jsonRDD(RDD<String> json, double samplingRatio)<br>:: Experimental ::
SchemaRDD | jsonRDD(RDD<String> json, org.apache.spark.sql.catalyst.types.StructType schema)<br>:: Experimental :: Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, returning the result as a SchemaRDD.
SchemaRDD | logicalPlanToSparkQuery(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan plan)<br>:: DeveloperApi :: Allows catalyst LogicalPlans to be executed as a SchemaRDD.
int | numShufflePartitions()<br>Number of partitions to use for shuffle operators.
String | PARQUET_BINARY_AS_STRING()
String | PARQUET_CACHE_METADATA()
String | PARQUET_COMPRESSION()
String | parquetCompressionCodec()<br>The compression codec used when writing a Parquet file.
SchemaRDD | parquetFile(String path)<br>Loads a Parquet file, returning the result as a SchemaRDD.
<T> void | registerFunction(String name, scala.Function1<?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$1)<br>registerFunction 1-22 were generated by this script
<T> void | registerFunction(String name, scala.Function10<?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$10)
<T> void | registerFunction(String name, scala.Function11<?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$11)
<T> void | registerFunction(String name, scala.Function12<?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$12)
<T> void | registerFunction(String name, scala.Function13<?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$13)
<T> void | registerFunction(String name, scala.Function14<?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$14)
<T> void | registerFunction(String name, scala.Function15<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$15)
<T> void | registerFunction(String name, scala.Function16<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$16)
<T> void | registerFunction(String name, scala.Function17<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$17)
<T> void | registerFunction(String name, scala.Function18<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$18)
<T> void | registerFunction(String name, scala.Function19<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$19)
<T> void | registerFunction(String name, scala.Function2<?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$2)
<T> void | registerFunction(String name, scala.Function20<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$20)
<T> void | registerFunction(String name, scala.Function21<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$21)
<T> void | registerFunction(String name, scala.Function22<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$22)
<T> void | registerFunction(String name, scala.Function3<?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$3)
<T> void | registerFunction(String name, scala.Function4<?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$4)
<T> void | registerFunction(String name, scala.Function5<?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$5)
<T> void | registerFunction(String name, scala.Function6<?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$6)
<T> void | registerFunction(String name, scala.Function7<?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$7)
<T> void | registerFunction(String name, scala.Function8<?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$8)
<T> void | registerFunction(String name, scala.Function9<?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$9)
void | registerPython(String name, byte[] command, java.util.Map<String,String> envVars, java.util.List<String> pythonIncludes, String pythonExec, Accumulator<java.util.List<byte[]>> accumulator, String stringDataType)
void | registerRDDAsTable(SchemaRDD rdd, String tableName)<br>Registers the given RDD as a temporary table in the catalog.
void | setConf(java.util.Properties props)<br>Set Spark SQL configuration properties.
void | setConf(String key, String value)<br>Set the given Spark SQL configuration property.
java.util.Map<String,String> | settings()<br>Only a low degree of contention is expected for conf, thus NOT using ConcurrentHashMap.
String | SHUFFLE_PARTITIONS()
SparkContext | sparkContext()
SchemaRDD | sql(String sqlText)<br>Executes a SQL query using Spark, returning the result as a SchemaRDD.
SchemaRDD | table(String tableName)<br>Returns the specified table as a SchemaRDD.
String | THRIFTSERVER_POOL()
void | uncacheTable(String tableName)<br>Removes the specified table from the in-memory cache.
boolean | useCompression()<br>When true, tables cached using in-memory columnar caching will be compressed.
Methods inherited from class java.lang.Object: equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait

Methods inherited from interface org.apache.spark.Logging: initialized, initializeIfNecessary, initializeLogging, initLock, isTraceEnabled, log_, log, logDebug, logDebug, logError, logError, logInfo, logInfo, logName, logTrace, logTrace, logWarning, logWarning
public SQLContext(SparkContext sparkContext)
public SparkContext sparkContext()
public SchemaRDD logicalPlanToSparkQuery(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan plan)
:: DeveloperApi ::
Allows catalyst LogicalPlans to be executed as a SchemaRDD.
public <A extends scala.Product> SchemaRDD createSchemaRDD(RDD<A> rdd, scala.reflect.api.TypeTags.TypeTag<A> evidence$1)
Creates a SchemaRDD from an RDD of case classes.
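For illustration, a minimal sketch of the usual call pattern from Scala, where the compiler supplies the TypeTag evidence implicitly; it assumes an existing SparkContext sc and the same people.txt data used by the applySchema example below:

```scala
import org.apache.spark.sql._

val sqlContext = new SQLContext(sc)

// Field names of the case class become the column names of the SchemaRDD.
case class Person(name: String, age: Int)

val people = sc.textFile("examples/src/main/resources/people.txt")
  .map(_.split(","))
  .map(p => Person(p(0), p(1).trim.toInt))

// evidence$1 (the TypeTag) is filled in by the compiler.
val personRDD = sqlContext.createSchemaRDD(people)
personRDD.registerTempTable("people")
```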
public SchemaRDD applySchema(RDD<org.apache.spark.sql.catalyst.expressions.Row> rowRDD, org.apache.spark.sql.catalyst.types.StructType schema)
:: DeveloperApi ::
Creates a SchemaRDD from an RDD containing Rows by applying a schema to this RDD. It is important to make sure that the structure of every Row of the provided RDD matches the provided schema; otherwise, there will be a runtime exception.
Example:
```scala
import org.apache.spark.sql._
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val schema =
  StructType(
    StructField("name", StringType, false) ::
    StructField("age", IntegerType, true) :: Nil)

val people =
  sc.textFile("examples/src/main/resources/people.txt").map(
    _.split(",")).map(p => Row(p(0), p(1).trim.toInt))

val peopleSchemaRDD = sqlContext.applySchema(people, schema)
peopleSchemaRDD.printSchema
// root
// |-- name: string (nullable = false)
// |-- age: integer (nullable = true)

peopleSchemaRDD.registerTempTable("people")
sqlContext.sql("select name from people").collect.foreach(println)
```
public SchemaRDD parquetFile(String path)
Loads a Parquet file, returning the result as a SchemaRDD.
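A brief usage sketch; the people.parquet path is illustrative, standing in for a directory written earlier (for instance by SchemaRDD.saveAsParquetFile):

```scala
// Load Parquet data and expose it to SQL via a temporary table.
val parquetPeople = sqlContext.parquetFile("people.parquet")
parquetPeople.registerTempTable("parquetPeople")
sqlContext.sql("SELECT name FROM parquetPeople").collect().foreach(println)
```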
public SchemaRDD jsonFile(String path)
Loads a JSON file (one object per line), returning the result as a SchemaRDD.
It goes through the entire dataset once to determine the schema.
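A sketch of the schema-inference path, assuming a newline-delimited JSON file at the hypothetical path people.json:

```scala
// One pass over the data infers the schema; the query runs over a second pass.
val jsonPeople = sqlContext.jsonFile("people.json")
jsonPeople.printSchema()
jsonPeople.registerTempTable("jsonPeople")
sqlContext.sql("SELECT name FROM jsonPeople WHERE age > 21").collect().foreach(println)
```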
public SchemaRDD jsonFile(String path, org.apache.spark.sql.catalyst.types.StructType schema)
:: Experimental ::
Loads a JSON file (one object per line) and applies the given schema, returning the result as a SchemaRDD.
public SchemaRDD jsonFile(String path, double samplingRatio)
public SchemaRDD jsonRDD(RDD<String> json)
Loads an RDD[String] storing JSON objects (one object per record), returning the result as a SchemaRDD.
It goes through the entire dataset once to determine the schema.
public SchemaRDD jsonRDD(RDD<String> json, org.apache.spark.sql.catalyst.types.StructType schema)
:: Experimental ::
Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, returning the result as a SchemaRDD.
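A minimal sketch, assuming the schema types exported by org.apache.spark.sql._ (as in the applySchema example above); the inline records are illustrative:

```scala
import org.apache.spark.sql._

// One JSON object per record, held in memory for the example.
val records = sc.parallelize(
  """{"name":"Michael","age":29}""" ::
  """{"name":"Andy","age":30}""" :: Nil)

// Supplying the schema up front skips the inference pass over the data.
val jsonSchema =
  StructType(
    StructField("name", StringType, true) ::
    StructField("age", IntegerType, true) :: Nil)

val typedPeople = sqlContext.jsonRDD(records, jsonSchema)
typedPeople.registerTempTable("typedPeople")
```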
public <A extends scala.Product> SchemaRDD createParquetFile(String path, boolean allowExisting, org.apache.hadoop.conf.Configuration conf, scala.reflect.api.TypeTags.TypeTag<A> evidence$2)
:: Experimental ::
Creates an empty Parquet file with the schema of class A, which can be registered as a table. This registered table can be used as the target of future insertInto operations.

```scala
val sqlContext = new SQLContext(...)
import sqlContext._

case class Person(name: String, age: Int)
createParquetFile[Person]("path/to/file.parquet").registerTempTable("people")
sql("INSERT INTO people SELECT 'michael', 29")
```

Parameters:
- path - The path where the directory containing parquet metadata should be created. Data inserted into this table will also be stored at this location.
- allowExisting - When false, an exception will be thrown if this directory already exists.
- conf - A Hadoop configuration object that can be used to specify options to the parquet output format.
public void registerRDDAsTable(SchemaRDD rdd, String tableName)
public SchemaRDD sql(String sqlText)
public SchemaRDD table(String tableName)
public void cacheTable(String tableName)
public void uncacheTable(String tableName)
public boolean isCached(String tableName)
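Taken together, these methods cover a register/query/cache round trip. A sketch reusing the peopleSchemaRDD built in the applySchema example above:

```scala
// Register a SchemaRDD under a table name, query it, then cache and evict it.
sqlContext.registerRDDAsTable(peopleSchemaRDD, "people")

val adults = sqlContext.sql("SELECT name FROM people WHERE age >= 18")
adults.collect().foreach(println)

sqlContext.cacheTable("people")     // later scans use the in-memory columnar form
assert(sqlContext.isCached("people"))
sqlContext.table("people").count()  // answered from the cache
sqlContext.uncacheTable("people")
```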
public String COMPRESS_CACHED()
public String COLUMN_BATCH_SIZE()
public String AUTO_BROADCASTJOIN_THRESHOLD()
public String DEFAULT_SIZE_IN_BYTES()
public String SHUFFLE_PARTITIONS()
public String CODEGEN_ENABLED()
public String DIALECT()
public String PARQUET_BINARY_AS_STRING()
public String PARQUET_CACHE_METADATA()
public String PARQUET_COMPRESSION()
public String THRIFTSERVER_POOL()
public java.util.Map<String,String> settings()
public String dialect()
The SQL dialect that is used when parsing queries. When using a HiveContext, this value defaults to 'hiveql', which uses the Hive 0.12.0 HiveQL parser. Users can change this to 'sql' if they want to run queries that aren't supported by HiveQL (e.g., SELECT 1).
Note that the choice of dialect does not affect things like what tables are available or how query execution is performed.
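Since the dialect is just the configuration property named by DIALECT() (spark.sql.dialect), it can be changed at runtime; a brief sketch:

```scala
// Switch to the plain SQL parser, e.g. on a HiveContext.
sqlContext.setConf("spark.sql.dialect", "sql")
sqlContext.sql("SELECT 1").collect()
```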
public boolean useCompression()
public String parquetCompressionCodec()
public int columnBatchSize()
public int numShufflePartitions()
public boolean codegenEnabled()
Defaults to false as this feature is currently experimental.
public int autoBroadcastJoinThreshold()
Upper bound on the sizes (in bytes) of the tables qualified for auto conversion to a broadcast value during the physical execution of join operations. Hive setting: hive.auto.convert.join.noconditionaltask.size, whose default value is also 10000.
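The bound is read from the property named by AUTO_BROADCASTJOIN_THRESHOLD() (spark.sql.autoBroadcastJoinThreshold), so it can be tuned per context; the 10 MB figure below is illustrative:

```scala
// Allow broadcast joins for tables up to roughly 10 MB.
sqlContext.setConf("spark.sql.autoBroadcastJoinThreshold", (10 * 1024 * 1024).toString)
```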
public long defaultSizeInBytes()
The default size in bytes to assign to a logical operator's estimation statistics. By default, it is set to a larger value than autoBroadcastJoinThreshold, hence any logical operator without a properly implemented estimation of this statistic will not be incorrectly broadcasted in joins.
public boolean isParquetBinaryAsString()
When set to true, we always treat byte arrays in Parquet files as strings.
public void setConf(java.util.Properties props)
public void setConf(String key, String value)
public String getConf(String key)
public String getConf(String key, String defaultValue)
Return the value of the Spark SQL configuration property for the given key. If the key is not set yet, return defaultValue.
public scala.collection.immutable.Map<String,String> getAllConfs()
Return all the configuration properties that have been set (i.e., not the defaults).
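A configuration round-trip sketch using the shuffle-partitions property named by SHUFFLE_PARTITIONS() (spark.sql.shuffle.partitions); the unset key below is a hypothetical placeholder:

```scala
// Set a property, then read it back; the two-argument getConf
// returns the supplied default when the key has not been set.
sqlContext.setConf("spark.sql.shuffle.partitions", "50")
val n  = sqlContext.getConf("spark.sql.shuffle.partitions")          // "50"
val fb = sqlContext.getConf("spark.sql.some.unset.key", "fallback")  // "fallback"
```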
public void clear()
public void registerPython(String name, byte[] command, java.util.Map<String,String> envVars, java.util.List<String> pythonIncludes, String pythonExec, Accumulator<java.util.List<byte[]>> accumulator, String stringDataType)
public <T> void registerFunction(String name, scala.Function1<?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$1)
registerFunction 1-22 were generated by this script:

```scala
(1 to 22).map { x =>
  val types = (1 to x).map(x => "_").reduce(_ + ", " + _)
  s"""
    def registerFunction[T: TypeTag](name: String, func: Function$x[$types, T]): Unit = {
      def builder(e: Seq[Expression]) = ScalaUdf(func, ScalaReflection.schemaFor(typeTag[T]).dataType, e)
      functionRegistry.registerFunction(name, builder)
    }
  """
}
```
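A sketch of the one-argument overload called from Scala, where the TypeTag is inferred; the UDF name strLen and the people table are illustrative:

```scala
// Register a Scala function as a SQL UDF and invoke it by name.
sqlContext.registerFunction("strLen", (s: String) => s.length)
sqlContext.sql("SELECT strLen(name) FROM people").collect().foreach(println)
```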
public <T> void registerFunction(String name, scala.Function2<?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$2)
public <T> void registerFunction(String name, scala.Function3<?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$3)
public <T> void registerFunction(String name, scala.Function4<?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$4)
public <T> void registerFunction(String name, scala.Function5<?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$5)
public <T> void registerFunction(String name, scala.Function6<?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$6)
public <T> void registerFunction(String name, scala.Function7<?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$7)
public <T> void registerFunction(String name, scala.Function8<?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$8)
public <T> void registerFunction(String name, scala.Function9<?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$9)
public <T> void registerFunction(String name, scala.Function10<?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$10)
public <T> void registerFunction(String name, scala.Function11<?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$11)
public <T> void registerFunction(String name, scala.Function12<?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$12)
public <T> void registerFunction(String name, scala.Function13<?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$13)
public <T> void registerFunction(String name, scala.Function14<?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$14)
public <T> void registerFunction(String name, scala.Function15<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$15)
public <T> void registerFunction(String name, scala.Function16<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$16)
public <T> void registerFunction(String name, scala.Function17<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$17)
public <T> void registerFunction(String name, scala.Function18<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$18)
public <T> void registerFunction(String name, scala.Function19<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$19)
public <T> void registerFunction(String name, scala.Function20<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$20)
public <T> void registerFunction(String name, scala.Function21<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$21)
public <T> void registerFunction(String name, scala.Function22<?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,T> func, scala.reflect.api.TypeTags.TypeTag<T> evidence$22)