CombineFileInputFormat 始终只启动一个 map 任务（Hadoop 1.2.1）

2024-01-14

我正在尝试用 CombineFileInputFormat 处理若干小文件（共 20 个文件，每个约 8 MB）。我按照这篇博客 http://yaseminavcular.blogspot.in/2011_03_01_archive.html 中给出的示例进行操作，已经能够实现并测试它，最终结果也是正确的。但令我惊讶的是，作业始终只启动一个 map 任务。我尝试把属性“mapred.max.split.size”设置为 16MB、32MB 等各种值（当然以字节为单位），但都没有效果。我还需要做些什么？还是说这本来就是正确的行为？

我运行的是一个两节点集群，默认副本数（replication）为 2。下面是我编写的代码。非常感谢任何帮助。

package inverika.test.retail;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * MapReduce job that counts how many input records belong to each category.
 *
 * <p>Every input line is treated as a comma-separated record and the field at
 * index {@link #CATEGORY_COLUMN} is used as the category. The job writes one
 * {@code category,count} line per distinct category. Input is read through
 * {@code CombinedInputFormat} so that several small files can share a map task.
 */
public class CategoryCount {

    /** Zero-based index of the category field inside a comma-separated record. */
    private static final int CATEGORY_COLUMN = 4;

    /** Upper bound, in bytes, requested for one combined input split (16 MB). */
    private static final long MAX_SPLIT_BYTES = 16L * 1024 * 1024;

    /** Emits {@code (category, 1)} for every well-formed input record. */
    public static class CategoryMapper
        extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        // Reused for every record so the mapper does not allocate a Text per line.
        private final Text category = new Text();

        /**
         * Extracts the category field from one input line and emits it with a count of one.
         *
         * <p>Lines that do not contain a category field (blank lines, truncated
         * records) are skipped and tallied in the {@code MALFORMED_RECORDS}
         * counter instead of failing the task with an
         * {@link ArrayIndexOutOfBoundsException}.
         *
         * @param key     key supplied by the input format; not used
         * @param value   one line of comma-separated input
         * @param context task context used to emit output and update counters
         * @throws IOException          if the output cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] columns = value.toString().split(",");
            if (columns.length <= CATEGORY_COLUMN) {
                context.getCounter("CategoryCount", "MALFORMED_RECORDS").increment(1);
                return;
            }
            category.set(columns[CATEGORY_COLUMN]);
            context.write(category, ONE);
        }
    }

    /** Sums the per-record counts emitted for each category. */
    public static class CategoryReducer
        extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused for every key so the reducer does not allocate an IntWritable per category.
        private final IntWritable total = new IntWritable();

        /**
         * Adds up all counts received for one category and emits the total.
         *
         * @param key     the category
         * @param values  the partial counts for that category
         * @param context task context used to emit the total
         * @throws IOException          if the output cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * @param args exactly two entries: the input path and the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: CategoryCount <input Path> <output Path>");
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        conf.set("mapred.textoutputformat.separator", ",");

        Job job = new Job(conf, "Retail Category Count");
        job.setJarByClass(CategoryCount.class);
        job.setMapperClass(CategoryMapper.class);
        job.setReducerClass(CategoryReducer.class);

        job.setInputFormatClass(CombinedInputFormat.class);
        // The helper stores the limit in the job configuration, so the raw
        // "mapred.max.split.size" key does not have to be set by hand as well.
        FileInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_BYTES);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

下面是我实现的 CombineFileInputFormat 子类：

package inverika.test.retail;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;

/**
 * Input format that reads text lines from splits which may each cover several files.
 *
 * <p>Each split is handed to a {@link CombineFileRecordReader}, which in turn
 * uses one {@link myCombineFileRecordReader} per file chunk in the split.
 */
public class CombinedInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    /**
     * Line reader for a single file chunk of a {@link CombineFileSplit}.
     *
     * <p>All record access is forwarded to a {@link LineRecordReader} that is
     * fully set up in the constructor.
     */
    public static class myCombineFileRecordReader extends RecordReader<LongWritable, Text> {
        private final LineRecordReader delegate;

        /**
         * Opens the chunk at position {@code index} of the combined split.
         *
         * @param split   the combined split containing the chunk
         * @param context the task attempt context
         * @param index   position of the chunk inside {@code split}
         * @throws IOException if the chunk cannot be opened
         */
        public myCombineFileRecordReader(CombineFileSplit split,
                TaskAttemptContext context, Integer index) throws IOException {
            int chunk = index;
            long start = split.getOffset(chunk);
            long length = split.getLength(chunk);
            String[] hosts = split.getLocations();

            delegate = new LineRecordReader();
            delegate.initialize(
                    new FileSplit(split.getPath(chunk), start, length, hosts), context);
        }

        /** Intentionally does nothing: the underlying reader is initialized in the constructor. */
        @Override
        public void initialize(InputSplit inputSplit, TaskAttemptContext context)
                throws IOException, InterruptedException {
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return delegate.nextKeyValue();
        }

        @Override
        public LongWritable getCurrentKey() throws IOException,
                InterruptedException {
            return delegate.getCurrentKey();
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return delegate.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException {
            return delegate.getProgress();
        }

        @Override
        public void close() throws IOException {
            delegate.close();
        }
    }

    /**
     * Builds the reader that walks through every file chunk of a combined split.
     *
     * @param split   the split to read; must be a {@link CombineFileSplit}
     * @param context the task attempt context
     * @return a reader that creates one {@link myCombineFileRecordReader} per chunk
     * @throws IOException if the reader cannot be created
     */
    @Override
    public RecordReader<LongWritable, Text>
            createRecordReader(InputSplit split, TaskAttemptContext context)
                    throws IOException {
        CombineFileSplit combined = (CombineFileSplit) split;
        return new CombineFileRecordReader<LongWritable, Text>(
                combined, context, myCombineFileRecordReader.class);
    }
}

使用 CombineFileInputFormat 作为输入格式类时，需要设置最大分片大小（max split size）；否则，当所有数据块都位于同一机架时，你很可能恰好只得到一个 mapper。

您可以通过以下方式之一实现此目的：

调用 CombineFileInputFormat.setMaxSplitSize() 方法：https://hadoop.apache.org/docs/r2.2.0/api/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.html#setMaxSplitSize(long)
设置配置参数 mapreduce.input.fileinputformat.split.maxsize，或者 ~~mapred.max.split.size~~（已弃用）
例如，执行下面的调用：
```
// 256 MB, written as a long literal so no cast is needed.
job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
```
这样就把最大分片大小设置成了 256MB。

参考：

https://hadoop.apache.org/docs/r2.2.0/api/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.html
http://mail-archives.apache.org/mod_mbox/hadoop-common-user/201004.mbox/%3C35374.30384.qm@web63402.mail.re1.yahoo.com%3E

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

Hadoop

MapReduce

组合文件输入格式始终只启动一张地图 Hadoop 1.2.1 的相关文章

Hadoop NoSuchMethodError apache.commons.cli

我在用着hadoop 2 7 2我用 IntelliJ 做了一个 MapReduce 工作在我的工作中我正在使用apache commons cli 1 3 1我把库放在罐子里当我在 Hadoop 集群上使用 MapReduceJob
从 Spark 访问 Hdfs 会出现令牌缓存错误 Can't get Master Kerberosprincipal for use as renewer

我正在尝试运行测试 Spark 脚本以便将 Spark 连接到 hadoop 脚本如下 from pyspark import SparkContext sc SparkContext local Simple App file sc t
如何找到 JAR：/home/hadoop/contrib/streaming/hadoop-streaming.jar

我正在练习有关 Amazon EMR 的复数视角视频教程我被困住了因为我收到此错误而无法继续 Not a valid JAR home hadoop contrib streaming hadoop streaming jar 请注意
无法验证 serde：org.openx.data.jsonserde.jsonserde

我编写了这个查询来在配置单元上创建一个表我的数据最初是 json 格式所以我已经下载并构建了 serde 并添加了它运行所需的所有 jar 但我收到以下错误 FAILED Execution Error return code 1 fr
Hadoop 上的 Sqoop：NoSuchMethodError：com.google.common.base.Stopwatch.createStarted() [重复]

这个问题在这里已经有答案了我在 Google Cloud DataProc 上的 hadoop 上运行 sqoop 以通过 Cloud SQL 代理访问 postgresql 但遇到 Java 依赖项错误 INFO First Cloud
在 Hadoop MapReduce 中为二进制文件创建自定义 InputFormat 和 RecordReader

我正在编写一个 M R 作业该作业处理以二进制格式编写的大型时间序列数据文件如下所示此处换行以提高可读性显然实际数据是连续的 TIMESTAMP 1 TIMESTAMP 1 TIMESTAMP 2 TIMESTAMP 2 TIME
Hive：在查询中将 array 转换为 array

我有两张桌子 create table a 1 array
Java8：使用 Stream / Map-Reduce / Collector 将 HashMap 转换为 HashMap

我知道如何改造一个简单的JavaList from Y gt Z i e List
Apache Spark 何时发生混洗？

我正在优化 Spark 中的参数并且想确切地了解 Spark 是如何对数据进行洗牌的准确地说我有一个简单的字数统计程序并且想知道spark shuffle file buffer kb如何影响运行时间现在当我将此参数设置得非常高
Python 包安装：pip 与 yum，还是两者一起安装？

我刚刚开始管理 Hadoop 集群我们使用 Bright Cluster Manager 直至操作系统级别 CentOS 7 1 然后使用 Ambari 以及适用于 Hadoop 的 Hortonworks HDP 2 3 我不断收到安装
使用 org.apache.hadoop/* 依赖项离线编译 sbt 时遇到的问题

使用依赖于 org apache hadoop 包的 sbt 进行离线编译时遇到很多麻烦一个简单的build sbt name Test version 1 0 scalaVersion 2 10 4 libraryDependencie
MongoDB/PyMongo：如何在 Map 函数中使用点表示法？

我正在尝试计算每个邮政编码中找到的记录数在我的 MongoDB 中嵌入了邮政编码使用点表示法它位于 a res z a 代表地址 res 代表住宅 z 代表邮政编码例如这工作得很好 db NY count a res z 141
以不同用户身份运行 MapReduce 作业

我有一个与 Hadoop 交互的 Web 应用程序 Cloudera cdh3u6 特定的用户操作应在集群中启动新的 MapReduce 作业该集群不是一个安全集群但它使用简单的组身份验证因此如果我以自己的身份通过 ssh 连接到它
如何用snappy解压hadoop的reduce输出文件尾？

我们的 hadoop 集群使用 snappy 作为默认编解码器 Hadoop作业减少输出文件名就像part r 00000 snappy JSnappy 无法解压缩文件 bcz JSnappy 需要以 SNZ 开头的文件归约输出文件以某种
以编程方式读取 Hadoop Mapreduce 程序的输出

这可能是一个基本问题但我在谷歌上找不到答案我有一个映射缩减作业它在其输出目录中创建多个输出文件我的 Java 应用程序在远程 hadoop 集群上执行此作业作业完成后需要使用以下命令以编程方式读取输出org apache had
伪模式下没有名称节点错误

我是hadoop新手正处于学习阶段根据 Hadoop Definitve 指南我已将 hadoop 设置为伪分布式模式一切正常昨天我什至能够执行第三章中的所有示例今天当我重新启动我的unix并尝试运行start dfs sh然
Protobuf RPC 在 Hadoop 2.2.0 单节点服务器上不可用？

我正在尝试在按照本教程安装的本地单节点集群上运行 hadoop 2 2 0 mapreduce 作业 http codesfusion blogspot co at 2013 10 setup hadoop 2x 220 on ubuntu
Hadoop - 直接从 Mapper 写入 HBase

我有一个 hadoop 作业其输出应写入 HBase 我并不真正需要减速器我想要插入的行类型是在映射器中确定的如何使用 TableOutputFormat 来实现此目的从所有示例中我看到的假设是 reducer 是创建 Put 的
MapReduce 中 1 个任务的减速器数量

在典型的 MapReduce 设置如 Hadoop 中 1 个任务使用多少个减速器例如计算单词数我对 Google MapReduce 的理解意味着只涉及 1 个减速器那是对的吗例如单词计数会将输入分为 N 个块并且 N 个
Hadoop fs 查找块大小？

在 Hadoop fs 中如何查找特定文件的块大小我主要对命令行感兴趣例如 hadoop fs hdfs fs1 data 但看起来这并不存在有Java解决方案吗 The fsck其他答案中的命令列出了块并允许您查看块的数量但是要

随机推荐

什么会导致抛出异常 16：“mutex: Resource busy”（使用 Boost / BB10）？

我已将一个用 C 和 Boost 编写的长期稳定的库移植到 Blackberry 10 该库在设备之间传输文件该库编译和链接良好并且运行良好但是在传输 1 2 或 3 个文件后我的 Blackberry 10 设备上总是会遇到抛出
Android 在任务锁定时运行另一个应用程序

我将我的应用程序设置为设备所有者并且当我拨打电话时屏幕固定startLockTask 当我尝试使用此方法运行另一个应用程序时我现在遇到的问题是 Intent i getPackageManager getLaunchIntentForP
MongoDB 社区无法在 Mac 上启动并出现错误

我写这封信是为了告知我已经使用brew High Sierra 安装了mongodb community并且它正在运行然后最近我升级了它似乎无法连接当我在终端中运行 mongo 时出现错误 MongoDB shell versio
如何使用flot js绘制按类别分组的多个条形图

我正在使用 flot 库来显示条形图我需要以这种方式按特定值进行分组
为什么在我的应用程序中更改选项卡时生命周期挂钩不运行？

我在 Angular github 存储库上发布了这个问题但被告知我所问的既不是错误也不是新功能他们建议我在这里发布这个问题我们的 Angular 应用程序使用带有 canActivate 防护的路由器当用户在浏览器上添加新选项卡
Rails 2.3.11 的稳定 rubygems 版本是什么？

现在我有 ruby gems 版本 1 3 7 我刚刚安装了 Rails 2 3 11 我也想更新我的 ruby gems Rails 2 3 11 的稳定 ruby gems 版本是什么我正在使用 RubyGems 1 7 2 运行 2
URLConnection 或 HTTPClient：哪个提供更好的功能和更高的效率？

我正在寻找为 Android 应用程序创建一个登录表单我想使用 post 方法将信息发送到服务器端由 PHP 文件处理它反过来验证参数并发回响应我查看了使用 HttpClient 和 URLConnection 的实现它们非常相似
如何修复 anaconda 中的“启动器中的致命错误：无法使用 *path*/scrapy.exe 创建进程”？ [复制]

这个问题在这里已经有答案了我正在尝试在 Windows 10 上安装 scrapy 通过遵循这些教程 https docs scrapy org en latest intro tutorial html https docs scrap
分配跟踪库未及时加载，无法查看堆栈跟踪

For opening Instruments we must profile first then select which template we have to profile memory leaks allocations zom
textPath 上的 SVG 带圆圈文本（中心对齐）

我遇到了与 SVG 相关的带圆圈文本的问题我的目标是创建一条路径允许我在上面书写同时将文本居中仍然跟踪我的路径从圆圈的顶部开始 Example 看起来就是这样 https i imgur com AtmrTUG png 内图 Pr
Python 请求和 __doPostBack 函数

我一整天都在为此苦苦挣扎我需要从一个网站上抓取数据该网站有一个按钮您需要单击该按钮才能查看数据按钮本身调用了 ASP NET 网站使用的这个著名的 dopostback javascript 函数 a class btn btn d
将 `issubclass()` 与 Django 模型结合使用

我有一些 Django 模型比如 class Foo models Model class Meta abstract True class Bar Foo pass 我希望能够找到从 Foo 继承的所有模型以便用它们执行任务这应该很
如何使用 Data.Time.UTCTime 将分钟添加到当前时间？

我想在给定的给定时间上添加减去给定的分钟数并找出结果时间例如假设给定时间为 11 30AM 要添加的分钟数为 100 则结果时间为 01 10PM 如何在 Haskell 中使用 Data Time 库做到这一点我尝试阅读Haske
ggplot2：一个图例，具有从公共变量派生的两个视觉属性

如何获得同时捕获颜色和大小的单个图例我的印象是如果使用公共变量则公共图例是默认的但下面的示例表明我遗漏了一些东西 library ggplot2 input lt as data frame matrix runif 60 nrow
尝试实例化 AVAudioPlayer 时，NSBundle 找到 mp3 文件，但找不到 m4a

我有一个简单的应用程序应该播放音频文件每当我将 mp3 文件加载到 NSURL 时效果都很好并且 AVAudioPlayer 可以播放歌曲但是当我更改为 m4a 文件时 NSString 路径返回 nil 就像它无法找到我的文件
更新 slick 中表格的前 n 行

我想在使用时更新表的前 n 行而不是整个行slick 3 0 这是更新所有版本 private this val active this filter a gt a status AccountStatus DISABLED db run
Python 扩展 - 有效地构造和检查大整数

我有一个本机库其自然接口将涉及传递潜在的大量数字我预计大约有一半如果我可以将值限制在单个寄存器中那么 PyLong FromUnsignedLongLong 和 PyLong AsUnsignedLongLong 将是合适的 PyL
禁用泛型中的“类型参数推断”的方法？

我希望为泛型赋予默认值将优先于类型推断但事实并非如此 placeholder for a real express response send const sendResponse x gt console log x function
将 mysite.com/app.php?appname=example-name&appid=numeric-id 重写为 mysite.com/app/app-name/numeric-id/

我是 PHP 和服务器端操作的新手所以在这里问这个基本问题我在这里找到了很多类似的问题和答案但我未能实现我想要的我如何编写 htaccess 文件来生成这些结果动态网址 mysite com app php appname exa
组合文件输入格式始终只启动一张地图 Hadoop 1.2.1

我正在尝试使用测试CombineFileInputFormat 来处理每个8 MB 的几个小文件 20 个文件我按照this中给出的示例进行操作blog http yaseminavcular blogspot in 2011 03 01

组合文件输入格式始终只启动一张地图 Hadoop 1.2.1

组合文件输入格式始终只启动一张地图 Hadoop 1.2.1 的相关文章

随机推荐

热门标签