
spring-batch - Spring Batch - Steps to Improve Performance


I am currently developing a data loader that reads files and writes to a database. I am using a partition handler to process multiple comma-separated files in 30 threads, and I want to scale up and improve throughput. Each day I receive 15,000 files (with 1 million records each); how can I scale this with Spring Batch so the job finishes within a day? Is there any open-source grid computing framework that can do this reasonably well, or are there any simple fine-tuning steps?

The Spring Batch data loader runs standalone; no web container is involved. It runs on a single Solaris machine with 24 CPUs, and the data is written to a single database with default isolation and propagation. The XML configuration is as follows:

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:batch="http://www.springframework.org/schema/batch"
xmlns:p="http://www.springframework.org/schema/p"
xmlns:aop="http://www.springframework.org/schema/aop"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop.xsd
http://www.springframework.org/schema/batch http://www.springframework.org/schema/batch/spring-batch-2.1.xsd
http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd
http://www.springframework.org/schema/task http://www.springframework.org/schema/task/spring-task-3.0.xsd">

<!-- IMPORT DB CONFIG -->
<import resource="classpath:bom/bom/bomloader/job/DataSourcePoolConfig.xml" />

<!-- USE ANNOTATIONS TO CONFIGURE SPRING BEANS -->
<context:component-scan base-package="bom.bom.bom" />

<!-- INJECT THE PROCESS PARAMS HASHMAP BEFORE CONTEXT IS INITIALISED -->
<bean id="holder" class="bom.bom.bom.loader.util.PlaceHolderBean" >
<property name="beanName" value="holder"/>
</bean>

<bean id="logger" class="bom.bom.bom.loader.util.PlaceHolderBean" >
<property name="beanName" value="logger"/>
</bean>

<bean id="dataMap" class="java.util.concurrent.ConcurrentHashMap" />

<!-- JOB REPOSITORY - WE USE DATABASE REPOSITORY -->

<!-- <bean id="jobRepository" class="org.springframework.batch.core.repository.support.JobRepositoryFactoryBean" >-->
<!-- <property name="transactionManager" ref="frdtransactionManager" />-->
<!-- <property name="dataSource" ref="frddataSource" />-->
<!-- <property name="databaseType" value="oracle" />-->
<!-- <property name="tablePrefix" value="batch_"/> -->
<!-- </bean>-->

<!-- JOB REPOSITORY - WE USE AN IN-MEMORY REPOSITORY -->

<bean id="jobRepository" class="org.springframework.batch.core.repository.support.MapJobRepositoryFactoryBean">
<property name="transactionManager" ref="frdtransactionManager" />
</bean>
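<!-- NOTE: MapJobRepositoryFactoryBean keeps all job/step execution metadata in
     non-persistent in-memory maps; it is intended for testing and prototyping,
     which is why a database-backed repository (commented out above) is the
     usual choice for production runs. -->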

<!-- <bean id="jobExplorer" class="org.springframework.batch.core.explore.support.JobExplorerFactoryBean">-->
<!-- <property name="dataSource" ref="frddataSource" />-->
<!-- <property name="tablePrefix" value="batch_"/> -->
<!-- </bean>-->

<!-- LAUNCH JOBS FROM A REPOSITORY -->

<bean id="jobLauncher" class="org.springframework.batch.core.launch.support.SimpleJobLauncher">
<property name="jobRepository" ref="jobRepository" />
<property name="taskExecutor">
<bean class="org.springframework.core.task.SyncTaskExecutor" />
</property>
</bean>

<!-- CONFIGURE SCHEDULING IN QUARTZ -->
<!-- <bean id="jobDetail" class="org.springframework.scheduling.quartz.JobDetailBean">-->
<!-- <property name="jobClass" value="bom.bom.bom.assurance.core.JobLauncherDetails" />-->
<!-- <property name="group" value="quartz-batch" />-->
<!-- <property name="jobDataAsMap">-->
<!-- <map>-->
<!-- <entry key="jobName" value="${jobname}"/>-->
<!-- <entry key="jobLocator" value-ref="jobRegistry"/>-->
<!-- <entry key="jobLauncher" value-ref="jobLauncher"/>-->
<!-- </map>-->
<!-- </property>-->
<!-- </bean>-->

<!-- RUN EVERY 2 HOURS -->
<!-- <bean class="org.springframework.scheduling.quartz.SchedulerFactoryBean">-->
<!-- <property name="triggers">-->
<!-- <bean id="cronTrigger" class="org.springframework.scheduling.quartz.CronTriggerBean">-->
<!-- <property name="jobDetail" ref="jobDetail" />-->
<!-- <property name="cronExpression" value="2/0 * * * * ?" />-->
<!-- </bean>-->
<!-- </property>-->
<!-- </bean>-->

<!-- RUN STANDALONE -->
<bean id="jobRunner" class="bom.bom.bom.loader.core.DataLoaderJobRunner">
<constructor-arg value="${LOADER_NAME}" />
</bean>

<!-- Get all the files for the exchanges and feed as resource to the MultiResourcePartitioner -->

<bean id="fileresource" class="bom.bom.bom.loader.util.FiltersFoldersResourceFactory" p:dataMap-ref="dataMap">
<property name="filePath" value="${PARENT_PATH}" />
<property name="acceptedFolders" value="${EXCH_CODE}" />
<property name="logger" ref="logger" />
</bean>

<!-- The network Data Loading Configuration goes here -->

<job id="CDR_network _PARALLEL" xmlns="http://www.springframework.org/schema/batch" restartable="false" >
<step id="PREPARE_CLEAN" >
<flow parent="prepareCleanFlow" />
<next on="COMPLETED" to="LOAD_EXCHANGE_DATA" />
<fail on="FAILED" exit-code="Failed on cleaning error records."/>
</step>
<step id="LOAD_EXCHANGE_DATA" >
<tasklet ref="businessData" transaction-manager="ratransactionManager" />
<next on="COMPLETED" to="LOAD_CDR_FILES" />
<fail on="FAILED" exit-code="FAILED ON LOADING EXCHANGE INFORMATION FROM DB." />
</step>
<step id="LOAD_CDR_FILES" >
<tasklet ref="fileresource" transaction-manager="frdtransactionManager" />
<next on="COMPLETED" to="PROCESS_FILE_TO_STAGING_TABLE_PARALLEL" />
<fail on="FAILED" exit-code="FAILED ON LOADING CDR FILES." />
</step>
<step id="PROCESS_FILE_TO_STAGING_TABLE_PARALLEL" next="limitDecision" >
<partition step="filestep" partitioner="filepartitioner" >
<handler grid-size="100" task-executor="executorWithCallerRunsPolicy" />
</partition>
</step>
<decision id="limitDecision" decider="limitDecider">
<next on="COMPLETED" to="MOVE_RECS_STAGING_TO_MAIN_TABLE" />
<next on="CONTINUE" to="PROCESS_FILE_TO_STAGING_TABLE_PARALLEL" />
</decision>
<step id="MOVE_RECS_STAGING_TO_MAIN_TABLE" >
<tasklet ref="moveRecords" transaction-manager="ratransactionManager" >
<transaction-attributes isolation="SERIALIZABLE"/>
</tasklet>
<fail on="FAILED" exit-code="FAILED ON MOVING DATA TO THE MAIN TABLE." />
<next on="*" to="PREPARE_ARCHIVE"/>
</step>
<step id="PREPARE_ARCHIVE" >
<flow parent="prepareArchiveFlow" />
<fail on="FAILED" exit-code="FAILED ON Archiving files" />
<end on="*" />
</step>
</job>

<flow id="prepareCleanFlow" xmlns="http://www.springframework.org/schema/batch">
<step id="CLEAN_ERROR_RECORDS" next="archivefileExistsDecisionInFlow" >
<tasklet ref="houseKeeping" transaction-manager="ratransactionManager" />
</step>
<decision id="archivefileExistsDecisionInFlow" decider="archivefileExistsDecider">
<end on="NO_ARCHIVE_FILE" />
<next on="ARCHIVE_FILE_EXISTS" to="runprepareArchiveFlow" />
</decision>
<step id="runprepareArchiveFlow" >
<flow parent="prepareArchiveFlow" />
</step>
</flow>

<flow id="prepareArchiveFlow" xmlns="http://www.springframework.org/schema/batch" >
<step id="ARCHIVE_CDR_FILES" >
<tasklet ref="archiveFiles" transaction-manager="frdtransactionManager" />
</step>
</flow>

<bean id="archivefileExistsDecider" class="bom.bom.bom.loader.util.ArchiveFileExistsDecider" >
<property name="logger" ref="logger" />
<property name="frdjdbcTemplate" ref="frdjdbcTemplate" />
</bean>

<bean id="filepartitioner" class="org.springframework.batch.core.partition.support.MultiResourcePartitioner" scope="step" >
<property name="resources" value="#{dataMap[processFiles]}"/>
</bean>

<task:executor id="executorWithCallerRunsPolicy"
pool-size="90-95"
queue-capacity="6"
rejection-policy="CALLER_RUNS"/>
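<!-- pool-size "90-95" means 90 core threads and a 95 maximum; when the queue
     (capacity 6) is full and all threads are busy, the submitting thread runs
     the task itself (CALLER_RUNS) instead of rejecting it. -->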

<!-- <bean id="dynamicJobParameters" class="bom.bom.bom.assurance.core.DynamicJobParameters" />-->


<bean id="houseKeeping" class="bom.bom.bom.loader.core.HousekeepingOperation">
<property name="logger" ref="logger" />
<property name="jdbcTemplate" ref="rajdbcTemplate" />
<property name="frdjdbcTemplate" ref="frdjdbcTemplate" />
</bean>

<bean id="businessData" class="bom.bom.bom.loader.core.BusinessValidatorData">
<property name="logger" ref="logger" />
<property name="jdbcTemplate" ref="NrajdbcTemplate" />
<property name="param" value="${EXCH_CODE}" />
<property name="sql" value="${LOOKUP_QUERY}" />
</bean>

<step id="filestep" xmlns="http://www.springframework.org/schema/batch">
<tasklet transaction-manager="ratransactionManager" allow-start-if-complete="true" >
<chunk writer="jdbcItenWriter" reader="fileItemReader" processor="itemProcessor" commit-interval="500" retry-limit="2">
<retryable-exception-classes>
<include class="org.springframework.dao.DeadlockLoserDataAccessException"/>
</retryable-exception-classes>
</chunk>
<listeners>
<listener ref="customStepExecutionListener" />
</listeners>
</tasklet>
</step>

<bean id="moveRecords" class="bom.bom.bom.loader.core.MoveDataFromStaging">
<property name="logger" ref="logger" />
<property name="jdbcTemplate" ref="rajdbcTemplate" />
</bean>

<bean id="archiveFiles" class="bom.bom.bom.loader.core.ArchiveCDRFile" >
<property name="logger" ref="logger" />
<property name="jdbcTemplate" ref="frdjdbcTemplate" />
<property name="archiveFlag" value="${ARCHIVE_FILE}" />
<property name="archiveDir" value="${ARCHIVE_LOCATION}" />
</bean>

<bean id="limitDecider" class="bom.bom.bom.loader.util.LimitDecider" p:dataMap-ref="dataMap">
<property name="logger" ref="logger" />
</bean>

<!-- <bean id="multifileReader" class="org.springframework.batch.item.file.MultiResourceItemReader" scope="step" >-->
<!-- <property name="resources" value="#{stepExecutionContext[fileName]}" />-->
<!-- <property name="delegate" ref="fileItemReader" />-->
<!-- </bean>-->

<!-- READ EACH FILE PARALLELY -->
<bean id="fileItemReader" scope="step" autowire-candidate="false" parent="itemReaderParent">
<property name="resource" value="#{stepExecutionContext[fileName]}" />
<property name="saveState" value="false" />
</bean>

<!-- LISTEN AT THE END OF EACH FILE TO DO POST PROCESSING -->
<bean id="customStepExecutionListener" class="bom.bom.bom.loader.core.StagingStepExecutionListener" scope="step">
<property name="logger" ref="logger" />
<property name="frdjdbcTemplate" ref="frdjdbcTemplate" />
<property name="jdbcTemplate" ref="rajdbcTemplate" />
<property name="sql" value="${INSERT_IA_QUERY_COLUMNS}" />

</bean>

<!-- CONFIGURE THE ITEM PROCESSOR TO DO BUSINESS LOGIC ON EACH ITEM -->
<bean id="itemProcessor" class="bom.bom.bom.loader.core.StagingLogicProcessor" scope="step">
<property name="logger" ref="logger" />
<property name="params" ref="businessData" />
</bean>

<!-- CONFIGURE THE JDBC ITEM WRITER TO WRITE INTO THE DB -->
<bean id="jdbcItemWriter" class="org.springframework.batch.item.database.JdbcBatchItemWriter" scope="step">
<property name="dataSource" ref="radataSource"/>
<property name="sql">
<value>
<![CDATA[
${SQL1A}
]]>
</value>
</property>
<property name="itemSqlParameterSourceProvider">
<bean class="org.springframework.batch.item.database.BeanPropertyItemSqlParameterSourceProvider">
</bean>
</property>
</bean>

<!-- <bean id="itemWriter" class="bom.bom.bom.assurance.core.LoaderDBWriter" scope="step">-->
<!-- <property name="sQL" value="${loader.sql}" />-->
<!-- <property name="jdbcTemplate" ref="NrajdbcTemplate" />-->
<!-- </bean>-->


<!-- CONFIGURE THE FLAT FILE ITEM READER TO READ INDIVIDUAL BATCH -->
<bean id="itemReaderParent" class="org.springframework.batch.item.file.FlatFileItemReader" abstract="true">
<property name="strict" value="false"/>
<property name="lineMapper">
<bean class="org.springframework.batch.item.file.mapping.DefaultLineMapper">
<property name="lineTokenizer">
<bean class="org.springframework.batch.item.file.transform.FixedLengthTokenizer">
<property name="names" value="${COLUMNS}" />
<property name="columns" value="${RANGE}" />
</bean>
</property>
<property name="fieldSetMapper">
<bean class="bom.bom.bom.loader.util.DataLoaderMapper">
<property name="params" value="${BEANPROPERTIES}"/>
</bean>
</property>
</bean>
</property>
</bean>
</beans>

What I have tried:
  • I can see that the ThreadPoolExecutor hangs after 3 hours. prstat on Solaris says it is processing, but there is no processing visible in the log.
  • Tried a lower chunk size of 500; there was no progress, because of the memory footprint.
  • Since it inserts into a single database (with a pool of 30 connections), is there anything I can do here?

  • A snapshot from VisualVM (screenshot omitted):

    The thread stack traces are all locked at the connection level:
    Full thread dump Java HotSpot(TM) Server VM (11.3-b02 mixed mode):

    "Attach Listener" daemon prio=3 tid=0x00bbf800 nid=0x26 waiting on condition [0x00000000..0x00000000]
    java.lang.Thread.State: RUNNABLE

    "executorWithCallerRunsPolicy-1" prio=3 tid=0x008a7000 nid=0x25 runnable [0xd5a7d000..0xd5a7fb70]
    java.lang.Thread.State: RUNNABLE
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.read(SocketInputStream.java:129)
    at oracle.net.ns.Packet.receive(Packet.java:240)
    at oracle.net.ns.DataPacket.receive(DataPacket.java:92)
    at oracle.net.ns.NetInputStream.getNextPacket(NetInputStream.java:172)
    at oracle.net.ns.NetInputStream.read(NetInputStream.java:117)
    at oracle.net.ns.NetInputStream.read(NetInputStream.java:92)
    at oracle.net.ns.NetInputStream.read(NetInputStream.java:77)
    at oracle.jdbc.driver.T4CMAREngine.unmarshalUB1(T4CMAREngine.java:1034)
    at oracle.jdbc.driver.T4CMAREngine.unmarshalSB1(T4CMAREngine.java:1010)
    at oracle.jdbc.driver.T4C8Oall.receive(T4C8Oall.java:588)
    at oracle.jdbc.driver.T4CPreparedStatement.doOall8(T4CPreparedStatement.java:194)
    at oracle.jdbc.driver.T4CPreparedStatement.executeForRows(T4CPreparedStatement.java:953)
    at oracle.jdbc.driver.OracleStatement.doExecuteWithTimeout(OracleStatement.java:1222)
    at oracle.jdbc.driver.OraclePreparedStatement.executeInternal(OraclePreparedStatement.java:3387)
    at oracle.jdbc.driver.OraclePreparedStatement.executeUpdate(OraclePreparedStatement.java:3468)
    - locked <0xdbdafa30> (a oracle.jdbc.driver.T4CConnection)
    at oracle.jdbc.driver.OraclePreparedStatementWrapper.executeUpdate(OraclePreparedStatementWrapper.java:1350)
    at org.springframework.jdbc.core.JdbcTemplate$2.doInPreparedStatement(JdbcTemplate.java:818)
    at org.springframework.jdbc.core.JdbcTemplate$2.doInPreparedStatement(JdbcTemplate.java:1)
    at org.springframework.jdbc.core.JdbcTemplate.execute(JdbcTemplate.java:587)
    at org.springframework.jdbc.core.JdbcTemplate.update(JdbcTemplate.java:812)
    at org.springframework.jdbc.core.JdbcTemplate.update(JdbcTemplate.java:868)
    at org.springframework.jdbc.core.JdbcTemplate.update(JdbcTemplate.java:876)
    at

Best Answer

I suggest you lower the chunk size to 50.

500 seems too big: you wait too long while talking to the DB.
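Applied to the filestep above, this is a one-attribute change on the chunk element (a sketch; 50 is a starting point to measure against, not a magic number):

<chunk writer="jdbcItemWriter" reader="fileItemReader" processor="itemProcessor"
       commit-interval="50" retry-limit="2">
    <!-- retryable-exception-classes stay as in the original step -->
</chunk>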

At the same time, lower the TaskExecutor's pool size or increase your DB pool size.
You can pick which one by watching your database host: if its CPU and IO are not maxed out, increase the DB pool size to raise the database load; if the DB CPU is already maxed out, lower the TaskExecutor's pool size. The goal is a smooth, steady flow.

I think the DB will be your main limiting factor, so start by sizing the DB pool according to the database host's capacity. Once that is done, size the TaskExecutor's pool from the DB pool size (TE pool size = DB pool size * 1.5), within the batch host's own capacity (CPU, memory and IO).
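For example, with the 30 pooled connections mentioned in the question, that rule of thumb gives roughly 45 threads, a large cut from the 90-95 range configured above (a sketch, assuming the DB pool stays at 30):

<!-- 30 DB connections * 1.5 = 45 threads -->
<task:executor id="executorWithCallerRunsPolicy"
    pool-size="40-45"
    queue-capacity="6"
    rejection-policy="CALLER_RUNS"/>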

Splitting the incoming files across multiple hard drives may also help, if that is possible.

Regarding spring-batch - Spring Batch - Steps to Improve Performance, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/8163582/
