
S3 SlowDown error on Spark EMR

I get this error when writing Parquet files. It started happening recently:

com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Please reduce your request rate. (Service: Amazon S3; Status Code: 503; Error Code: SlowDown; Request ID: 2CA496E2AB87DC16), S3 Extended Request ID: 1dBrcqVGJU9VgoT79NAVGyN0fsbj9+6bipC7op97ZmP+zSFIuH72lN03ZtYabNIA2KaSj18a8ho=
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:1389)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.executeOneRequest(AmazonHttpClient.java:902)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:607)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.doExecute(AmazonHttpClient.java:376)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.executeWithTimer(AmazonHttpClient.java:338)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:287)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3826)
    at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:1777)
    at com.amazon.ws.emr.hadoop.fs.s3.lite.call.DeleteObjectsCall.perform(DeleteObjectsCall.java:22)
    at com.amazon.ws.emr.hadoop.fs.s3.lite.call.DeleteObjectsCall.perform(DeleteObjectsCall.java:7)
    at com.amazon.ws.emr.hadoop.fs.s3.lite.executor.GlobalS3Executor.execute(GlobalS3Executor.java:75)
    at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.invoke(AmazonS3LiteClient.java:176)
    at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.deleteObjects(AmazonS3LiteClient.java:125)
    at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.deleteAll(Jets3tNativeFileSystemStore.java:355)
    at sun.reflect.GeneratedMethodAccessor121.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:191)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
    at com.sun.proxy.$Proxy28.deleteAll(Unknown Source)
    at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.doSingleThreadedBatchDelete(S3NativeFileSystem.java:1331)
    at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.delete(S3NativeFileSystem.java:663)
    at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.delete(EmrFileSystem.java:296)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.cleanupJob(FileOutputCommitter.java:463)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortJob(FileOutputCommitter.java:482)
    at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.abortJob(HadoopMapReduceCommitProtocol.scala:134)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:146)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:121)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:121)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:101)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
    at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
    at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
    at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:492)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:215)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:198)
    at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:494)
    at com.radius.network_data.ingestion.processors.NetworkDataPathProcessor.process(NetworkDataPathProcessor.scala:38)
    at com.radius.network_data.ingestion.NetworkDataIngestionPipeline.com$radius$network_data$ingestion$NetworkDataIngestionPipeline$$processClient(NetworkDataIngestionPipeline.scala:51)
    at com.radius.network_data.ingestion.NetworkDataIngestionPipeline$$anonfun$run$1$$anonfun$apply$1.apply(NetworkDataIngestionPipeline.scala:42)
    at com.radius.network_data.ingestion.NetworkDataIngestionPipeline$$anonfun$run$1$$anonfun$apply$1.apply(NetworkDataIngestionPipeline.scala:41)
    at scala.collection.immutable.Set$Set1.foreach(Set.scala:94)
    at com.radius.network_data.ingestion.NetworkDataIngestionPipeline$$anonfun$run$1.apply(NetworkDataIngestionPipeline.scala:41)
    at com.radius.network_data.ingestion.NetworkDataIngestionPipeline$$anonfun$run$1.apply(NetworkDataIngestionPipeline.scala:39)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.parallel.ParIterableLike$Foreach.leaf(ParIterableLike.scala:972)
    at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply$mcV$sp(Tasks.scala:49)
    at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
    at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
    at scala.collection.parallel.Task$class.tryLeaf(Tasks.scala:51)
    at scala.collection.parallel.ParIterableLike$Foreach.tryLeaf(ParIterableLike.scala:969)
    at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask$class.internal(Tasks.scala:159)
    at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:443)
    at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask$class.compute(Tasks.scala:149)
    at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:443)
    at scala.concurrent.forkjoin.RecursiveAction.exec(RecursiveAction.java:160)
    at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
    at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
    at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
    at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

Any ideas on what could be going wrong? I'm using these write options:

result.write
      .mode(SaveMode.Overwrite)
      .partitionBy("entityType")
      .parquet(joinedPath)

It turned out the problem was caused by writing a very large number of tiny part files; it was fixed by reducing the number of output partitions, as sketched below.
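
For illustration, a minimal sketch of that fix applied to the snippet from the question (it reuses result and joinedPath from above; the target of 64 partitions is an assumed placeholder to tune for your data volume, not a recommendation):

import org.apache.spark.sql.SaveMode

// Coalescing to fewer partitions before the write means each entityType
// directory gets a handful of larger part files instead of thousands of
// tiny ones, which lowers the S3 request rate during the write and the
// job-commit cleanup that triggered the SlowDown error.
result
      .coalesce(64) // assumed value; pick based on total output size
      .write
      .mode(SaveMode.Overwrite)
      .partitionBy("entityType")
      .parquet(joinedPath)

coalesce avoids a full shuffle; if the data is badly skewed across partitions, repartition("entityType") is an alternative at the cost of a shuffle.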
