gpt4 book ai didi

Java 代码无法处理巨大的 CSV 文件

转载 作者:行者123 更新时间:2023-12-02 09:34:35 24 4
gpt4 key购买 nike

我是 Java 新手,我必须编写一段代码来处理巨大的 CSV 文件,更具体地说,是旋转文件并以不同的方式输出它。

当我用数百行测试代码时,它工作得很好!但是当我将文件设置为大约 700k 行时(事实就是如此),它会输出一个没有任何值的文件!

这是我正在使用的文件架构:

row_number,Time_float,V1_float,V2_float,V3_float,V4_float,V5_float,V6_float,V7_float,V8_float,V9_float,V10_float,V11_float,V12_float,V13_float,V14_float,V15_float,V16_float,V17_float,V18_float,V19_float,V20_float,V21_float,V22_float,V23_float,V24_float,V25_float,V26_float,V27_float,V28_float,Amount_float,Class_float
0,-1.996583023457193,-0.6942423209592997,-0.04407492457044802,1.6727734992241514,0.973365514375461,-0.24511658354252203,0.34706794516190337,0.19367893831762997,0.0826372794108694,0.33112778320993075,0.08338554524255039,-0.540407035731003,-0.6182957177945313,-0.9960989219768981,-0.32461018632700356,1.6040138389062168,-0.5368328685192513,0.24486345402090126,0.030769932602201018,0.4962820266510568,0.3261180160164485,-0.024923364961491876,0.38285443833968436,-0.17691133433749112,0.1105069205607409,0.2465854429694212,-0.3921704315485548,0.3308916226487169,-0.06378115069750527,0.24496426337017338,-0.04159897836869265
1,-1.996583023457193,0.608496327692869,0.16117591988327584,0.10979710210602639,0.31652292674812704,0.043483352047263765,-0.06181996595114916,-0.06370020974401087,0.07125348305960227,-0.23249418894056792,-0.15334962861450452,1.5800028495181635,1.0660885709726662,0.49141820388511015,-0.14998248182742244,0.6943604186631746,0.5294337538197206,-0.13516996909847068,-0.21876258231230966,-0.17908604930753447,-0.08961086263099204,-0.3073768045456866,-0.880076754448717,0.16220118362512442,-0.5611305498033158,0.3206939009070385,0.2610694754212664,-0.02225567818703569,0.04460751768012477,-0.3424745411051305,-0.04159897836869265
2,-1.9965619655334634,-0.6935004627168448,-0.8115778263099092,1.1694684928233277,0.2682312938426663,-0.3645717858377013,1.3514535859514658,0.6397756379029846,0.20737272949225669,-1.3786753514283783,0.19069961380161685,0.6118297100478469,0.06613661868011089,0.720699852326139,-0.173113888845362,2.5629061849805295,-3.298235372241396,1.3068678794417397,-0.14478999149835625,-2.7785608505208725,0.6809749715218957,0.33763169617058825,1.0633582711230263,1.456319745720107,-1.1380921384645095,-0.6285367205194723,-0.2884467520153837,-0.1371368556920797,-0.18102082710565728,1.1606859252297228,-0.04159897836869265
3,-1.9965619655334634,-0.4933248981469483,-0.11216942463929835,1.1825164508596404,-0.6097266412236745,-0.007468880343610824,0.9361498321757143,0.19207063819763437,0.3160175995113439,-1.2625031722059106,-0.050467953146436886,-0.22189161429591525,0.17837098770941834,0.5101687012441243,-0.30036049360793543,-0.6898374093803747,-1.2092959931757417,-0.8054446421603216,2.3453045221536306,-1.5142049233314772,-0.26985522543866364,-0.1474432966746349,0.007266907401368897,-0.30477654737727394,-1.941027139612268,1.2419037126390184,-0.46021734156146177,0.15539620725948264,0.1861885865396321,0.14053425198451397,-0.04159897836869265
4,-1.9965409076097336,-0.5913297637052182,0.5315410497431424,1.0214116755556233,0.28465540421364943,-0.29501543610441505,0.07199858317395955,0.4793022833609244,-0.22651023121331143,0.7443262870821311,0.6916250322008002,-0.8061465859382944,0.5386266478753359,1.3522443515849518,-1.1680335140679103,0.1913234721415302,-0.5152051215648076,-0.2790807862169683,-0.04556900286007717,0.987037297022314,0.5299387935696488,-0.012839217648641888,1.1000112712256314,-0.2201233960065819,0.2332500879381676,-0.39520164174981287,1.041611299621563,0.5436197963831321,0.6518159160992683,-0.07340334025310606,-0.04159897836869265
5,-1.9965409076097336,-0.21747462336443635,0.5816748868716723,0.7525853776136525,-0.11883332826086528,0.30500895924490323,-0.02231347965260122,0.3849359380956985,0.21795466774987732,-0.5176185728033812,-0.34110109646946285,1.3140464540936228,0.360182114067614,-0.3597915734090633,-0.1430571341620137,0.5655071130706558,0.45845966002880245,-0.06844505898436722,0.08190792006276612,-0.04077665181354113,0.1102154145899529,-0.2835222214491732,-0.7714270019345486,-0.04227284666287563,-0.6132733979473761,-0.4465835771366699,0.21963714625507125,0.6289004813773338,0.24563620173852233,-0.33855641663507124,-0.04159897836869265
6,-1.996498791762274,0.6277951843506749,0.08538910070753938,0.029922970138274736,0.8493831061038468,0.139019583041878,0.20469452783029024,-0.004170268076019058,0.0679975594125977,0.4232179413095267,-0.0911553627796218,-1.3881568003685008,-0.15394904162357412,-0.7546302429818224,0.174601525806113,0.05478291827131152,-0.5062323708749048,0.0033208458038780472,-0.7301429071537743,-0.05598631103904616,-0.28489532870127926,-0.22833366503144684,-0.373032409163239,-0.24677959261431537,-1.2879725270568725,1.4390366584060272,-0.5334361005147672,0.08549234948827854,0.015655983019391424,-0.33327894285907317,-0.04159897836869265
7,-1.9964356179910847,-0.3289283471917919,0.8586923440261,0.708576234767151,-0.34763105227477603,0.6875116870747386,0.32134542115371234,0.9058597727667212,-3.1882293041144427,0.5601291069522123,1.1474295272213637,-0.6068981847284727,0.2917078257188488,1.7663144960411647,-1.381049034972912,0.7496140133583983,-0.08687803964000758,-1.4389216521167638,-0.42738292340326745,0.39863532675456365,-0.20331695379069067,2.6458886485220905,-1.3992756084115,0.0920853274818958,-1.0727537219105157,-0.7966329656594401,-0.1070748551085608,-2.9901537037868864,-3.2880827705218882,-0.19010747625415497,-0.04159897836869265
8,-1.9964356179910847,-0.45657301681652984,0.17329147453401414,-0.07465262179597631,-0.19177387759003722,1.9341493665366207,2.793594054309211,0.29920595987227944,0.7125916765850463,-0.35685128708200337,-0.3769401127987114,-0.690809076979848,-0.11054073462883784,-0.2876133294193531,0.07756710959787069,-0.3592023119205175,-0.23974544333000244,-0.5884222737443606,0.14169463193440693,0.7006152251433341,0.06840582416474023,-0.09996300675392492,-0.36942472981431596,-0.327055266660248,1.6702690915973604,0.7159429034669824,-0.7966330619330197,0.02910414108912445,0.43142017601489174,0.019392240308193916,-0.04159897836869265
9,-1.9963935021436252,-0.17269774387883696,0.6780049203798014,0.6887814922128964,-0.15692675912027115,0.36179160642264196,-0.1852187109108031,0.5267057814810946,0.05822291589340413,-0.6705871938144513,-0.33691175301635523,0.996966068530707,0.837059523154456,1.0116259929700517,-0.46268060306897046,0.1641174886473323,0.8438820412040243,-0.6369448535966505,0.568708693313514,0.5549770030986336,0.2642433515365529,-0.336155613099156,-0.8732980464093,-0.19343791270246957,-0.6357673003859946,-0.1337734549071511,0.195341606203293,0.6100096992904638,0.25168133078356686,-0.33851643577313184,-0.04159897836869265
10,-1.9963724442198953,0.7398016180995844,-0.7123689085952064,0.6027095930866821,-0.9716078839795883,-1.4282856664438242,-0.4722411589208851,-1.150469214075524,0.04057090066824084,-1.5659577306254295,1.493926862783695,1.1753019928647082,-0.6719776055792052,-0.5163883874999073,-0.09915047715944188,0.2522962688421534,0.03648207425930615,0.29836818807428966,1.0192908179449345,-0.27193462986307226,-0.5022889927521785,-0.012663864860947654,0.43254001227028344,0.0444226909424958,0.826410614996712,0.4822144153958534,-0.26850047275423083,0.10616079403669354,0.04923997135326903,-0.32204432065410754,-0.04159897836869265

这是代码:

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;
import java.io.FileWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public class test {

public static void main(String[] args) {
try {
//csv file containing data
CSVReader reader = new CSVReader(new FileReader("/Users/csv-parser/sentiment.csv"));
CSVWriter writer = new CSVWriter(new FileWriter("/Users/csv-parser/Output.csv"));
String [] nextLine;
String [] Target_Columns = new String[] {"row_number", "column_name", "column_value_string", "column_value_float"};
List<String> Columns = null;
long start = System.currentTimeMillis();

int lineNumber = 0;

while (true) {
if ((nextLine = reader.readNext()) == null) break;
lineNumber++;
if (lineNumber == 1) {
writer.writeNext( Target_Columns );
Columns = Arrays.asList( nextLine );
// System.out.println( Arrays.toString( nextLine ) );
continue;
}
System.out.println( Arrays.toString( nextLine ) );
int indexOfRow = Columns.indexOf( "row_number" );
String row_value = "{ROW_NUMBER},{COl_NAME},{COL_STR},{COL_FL}";
int i = 1;
for(String column: Columns){
if (column.equals("row_number")) continue;
row_value = row_value.replace( "{ROW_NUMBER}", nextLine[0]);

if (column.contains( "_string" )) {
// System.out.print( "col_str: " + finalNextLines[i] + " " );
row_value = row_value.replace( "{COL_STR}", nextLine[i] );
row_value = row_value.replace( "{COL_FL}", " " );
row_value = row_value.replace( "{COl_NAME}", column.replace( "_string", "" ) );
}
if (column.contains( "_float" )) {
// System.out.print( "col_fl: " + finalNextLines[i] + " " );
row_value = row_value.replace( "{COL_STR}", " " );
row_value = row_value.replace( "{COL_FL}", nextLine[i] );
row_value = row_value.replace( "{COl_NAME}", column.replace( "_float", "" ) );
}


i++;
// System.out.println( "ROW: " + row_value + " " );
writer.writeNext( row_value.split( "," ) );
row_value = "{ROW_NUMBER},{COl_NAME},{COL_STR},{COL_FL}";
}

// System.out.println("\n");
}
writer.close();
long finish = System.currentTimeMillis();
System.out.println( "Time elapsed: " + (finish - start));

} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}

还有,有没有办法让代码同时读取/处理文件或使用多线程,我的意思是提高效率?

更新:

这是我处理原始文件时的输出结果 output

最佳答案

如果没有完整的堆栈跟踪,真的很难说出哪里出了问题,因为您的代码在性能、JVM 内存使用等方面存在很多潜在问题。我认为为您提供一个正确的解决方案更简单;至少,它可以处理 700k 文本文件。

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

public class ReadJSONExample {

public static void main(String[] args) throws IOException {
// Move time calculation out of main logic
long start = System.currentTimeMillis();

new CsvDataModifier(Paths.get("e:/data.csv"), Paths.get("e:/res.csv")).apply();

long finish = System.currentTimeMillis();
System.out.printf("Time elapsed: %d ms", finish - start);
}
}

final class CsvDataModifier {

private final Path src;
private final Path dest;

public CsvDataModifier(Path src, Path dest) {
this.src = src;
this.dest = dest;
}

private static final int ROW_NUMBER = 0;
private static final int COLUMN_NAME = 1;
private static final int COLUMN_VALUE_STRING = 2;
private static final int COLUMN_VALUE_FLOAT = 3;
private static final String[] COLUMN_NAMES = { "row_number", "column_name", "column_value_string", "column_value_float" };

public void apply() throws IOException {
// Use `try with recources` to close streams correctly
try (CSVReader reader = new CSVReader(new FileReader(src.toFile()));
CSVWriter writer = new CSVWriter(new FileWriter(dest.toFile()), ',', CSVWriter.NO_QUOTE_CHARACTER)) {
// key - ordered list of columns in source file
Map<String, Marker> columnNameFloatMarker = getSourceColumnNamesWithFloatMarker(reader.readNext());
int posRowNumber = getRowNumberPosition(columnNameFloatMarker.keySet());

if (columnNameFloatMarker.isEmpty())
return;

writer.writeNext(COLUMN_NAMES);

// Create buffer only once for all lines; do not use string concatenation or replacing
String[] buf = new String[COLUMN_NAMES.length];

reader.forEach(values -> {
buf[ROW_NUMBER] = values[posRowNumber];

int col = 0;
// this is just reference to buf[]; if `null` then no output
String[] resultLine;

for (Map.Entry<String, Marker> entry : columnNameFloatMarker.entrySet()) {
String columnName = entry.getKey();
Marker marker = entry.getValue();

if ((resultLine = marker.createResultLine(columnName, values[col], buf)) != null)
writer.writeNext(resultLine);

col++;
}
});
}
}

private static final String FLOAT = "_float";
private static final String STRING = "_string";

private enum Marker {
NONE {
@Override
public String[] createResultLine(String columnName, String value, String[] buf) {
return null;
}
},
STRING {
@Override
public String[] createResultLine(String columnName, String value, String[] buf) {
buf[COLUMN_VALUE_STRING] = value;
buf[COLUMN_VALUE_FLOAT] = " ";
buf[COLUMN_NAME] = columnName;
return buf;
}
},
FLOAT {
@Override
public String[] createResultLine(String columnName, String value, String[] buf) {
buf[COLUMN_VALUE_STRING] = " ";
buf[COLUMN_VALUE_FLOAT] = value;
buf[COLUMN_NAME] = columnName;
return buf;
}
};

public abstract String[] createResultLine(String columnName, String value, String[] buf);
}

// Source column preprocessing to avoid string comparision; do it only once
private static Map<String, Marker> getSourceColumnNamesWithFloatMarker(String... columns) {
if (columns == null || columns.length == 0)
return Collections.emptyMap();

Map<String, Marker> map = new LinkedHashMap<>();

for (int i = 0; i < columns.length; i++) {
String columnName = columns[i];
Marker marker = Marker.NONE;

if (columnName.endsWith(FLOAT)) {
columnName = columnName.substring(0, columnName.length() - FLOAT.length());
marker = Marker.FLOAT;
} else if (columnName.endsWith(STRING)) {
columnName = columnName.substring(0, columnName.length() - STRING.length());
marker = Marker.STRING;
}

if (map.put(columnName, marker) != null)
throw new IllegalArgumentException("Column duplication in the source file");
}

return map;
}

private static int getRowNumberPosition(Set<String> columnNames) {
int i = 0;

for (String columnName : columnNames) {
if ("row_number".equals(columnName))
return i;
i++;
}

throw new IllegalArgumentException("Source file does not contain 'row_number' column");
}
}
<小时/>
  • 输入文件: data.csv ~432 Mb
  • 输出文件: res.csv ~868 Mb
  • 控制台输出:已用时间:21058 毫秒

关于Java 代码无法处理巨大的 CSV 文件,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57646338/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com