public class HadoopRDD<K,V> extends RDD<scala.Tuple2<K,V>> implements Logging
org.apache.hadoop.mapred
).
Note: Instantiating this class directly is not recommended, please use
org.apache.spark.SparkContext.hadoopRDD()
Modifier and Type | Class and Description |
---|---|
static class |
HadoopRDD.HadoopMapPartitionsWithSplitRDD<U,T>
Analogous to
MapPartitionsRDD , but passes in an InputSplit to
the given function rather than the index of the partition. |
static class |
HadoopRDD.HadoopMapPartitionsWithSplitRDD$ |
static class |
HadoopRDD.SplitInfoReflections |
Constructor and Description |
---|
HadoopRDD(SparkContext sc,
Broadcast<SerializableWritable<org.apache.hadoop.conf.Configuration>> broadcastedConf,
scala.Option<scala.Function1<org.apache.hadoop.mapred.JobConf,scala.runtime.BoxedUnit>> initLocalJobConfFuncOpt,
Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass,
Class<K> keyClass,
Class<V> valueClass,
int minPartitions) |
HadoopRDD(SparkContext sc,
org.apache.hadoop.mapred.JobConf conf,
Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass,
Class<K> keyClass,
Class<V> valueClass,
int minPartitions) |
Modifier and Type | Method and Description |
---|---|
static void |
addLocalConfiguration(String jobTrackerId,
int jobId,
int splitId,
int attemptId,
org.apache.hadoop.mapred.JobConf conf)
Add Hadoop configuration specific to a single partition and attempt.
|
void |
checkpoint()
Mark this RDD for checkpointing.
|
InterruptibleIterator<scala.Tuple2<K,V>> |
compute(Partition theSplit,
TaskContext context)
:: DeveloperApi ::
Implemented by subclasses to compute a given partition.
|
static Object |
CONFIGURATION_INSTANTIATION_LOCK()
Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456).
|
static boolean |
containsCachedMetadata(String key) |
static scala.collection.Seq<String> |
convertSplitLocationInfo(Object[] infos) |
static Object |
getCachedMetadata(String key)
The three methods below are helpers for accessing the local map, a property of the SparkEnv of
the local process.
|
org.apache.hadoop.conf.Configuration |
getConf() |
Partition[] |
getPartitions()
Implemented by subclasses to return the set of partitions in this RDD.
|
scala.collection.Seq<String> |
getPreferredLocations(Partition split)
Optionally overridden by subclasses to specify placement preferences.
|
<U> RDD<U> |
mapPartitionsWithInputSplit(scala.Function2<org.apache.hadoop.mapred.InputSplit,scala.collection.Iterator<scala.Tuple2<K,V>>,scala.collection.Iterator<U>> f,
boolean preservesPartitioning,
scala.reflect.ClassTag<U> evidence$1)
Maps over a partition, providing the InputSplit that was used as the base of the partition.
|
static Object |
putCachedMetadata(String key,
Object value) |
static int |
RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES()
Update the input bytes read metric each time this number of records has been read
|
static scala.Option<HadoopRDD.SplitInfoReflections> |
SPLIT_INFO_REFLECTIONS() |
aggregate, cache, cartesian, checkpointData, coalesce, collect, collect, collectPartitions, computeOrReadCheckpoint, conf, context, count, countApprox, countApproxDistinct, countApproxDistinct, countByValue, countByValueApprox, creationSite, dependencies, distinct, distinct, doCheckpoint, elementClassTag, filter, filterWith, first, flatMap, flatMapWith, fold, foreach, foreachPartition, foreachWith, getCheckpointFile, getCreationSite, getNarrowAncestors, getStorageLevel, glom, groupBy, groupBy, groupBy, id, intersection, intersection, intersection, isCheckpointed, iterator, keyBy, map, mapPartitions, mapPartitionsWithContext, mapPartitionsWithIndex, mapPartitionsWithSplit, mapWith, markCheckpointed, max, min, name, partitioner, partitions, persist, persist, pipe, pipe, pipe, preferredLocations, randomSplit, reduce, repartition, retag, retag, sample, saveAsObjectFile, saveAsTextFile, saveAsTextFile, setName, sortBy, sparkContext, subtract, subtract, subtract, take, takeOrdered, takeSample, toArray, toDebugString, toJavaRDD, toLocalIterator, top, toString, union, unpersist, zip, zipPartitions, zipPartitions, zipPartitions, zipPartitions, zipPartitions, zipPartitions, zipWithIndex, zipWithUniqueId
initializeIfNecessary, initializeLogging, isTraceEnabled, log_, log, logDebug, logDebug, logError, logError, logInfo, logInfo, logName, logTrace, logTrace, logWarning, logWarning
public HadoopRDD(SparkContext sc, Broadcast<SerializableWritable<org.apache.hadoop.conf.Configuration>> broadcastedConf, scala.Option<scala.Function1<org.apache.hadoop.mapred.JobConf,scala.runtime.BoxedUnit>> initLocalJobConfFuncOpt, Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass, Class<K> keyClass, Class<V> valueClass, int minPartitions)
public HadoopRDD(SparkContext sc, org.apache.hadoop.mapred.JobConf conf, Class<? extends org.apache.hadoop.mapred.InputFormat<K,V>> inputFormatClass, Class<K> keyClass, Class<V> valueClass, int minPartitions)
public static Object CONFIGURATION_INSTANTIATION_LOCK()
public static int RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES()
public static Object getCachedMetadata(String key)
public static boolean containsCachedMetadata(String key)
public static Object putCachedMetadata(String key, Object value)
public static void addLocalConfiguration(String jobTrackerId, int jobId, int splitId, int attemptId, org.apache.hadoop.mapred.JobConf conf)
public static scala.Option<HadoopRDD.SplitInfoReflections> SPLIT_INFO_REFLECTIONS()
public static scala.collection.Seq<String> convertSplitLocationInfo(Object[] infos)
public Partition[] getPartitions()
RDD
public InterruptibleIterator<scala.Tuple2<K,V>> compute(Partition theSplit, TaskContext context)
RDD
public <U> RDD<U> mapPartitionsWithInputSplit(scala.Function2<org.apache.hadoop.mapred.InputSplit,scala.collection.Iterator<scala.Tuple2<K,V>>,scala.collection.Iterator<U>> f, boolean preservesPartitioning, scala.reflect.ClassTag<U> evidence$1)
public scala.collection.Seq<String> getPreferredLocations(Partition split)
RDD
public void checkpoint()
RDD
checkpoint
in class RDD<scala.Tuple2<K,V>>
public org.apache.hadoop.conf.Configuration getConf()