gpt4 book ai didi

java - 在内存中存储大 map

转载 作者:塔克拉玛干 更新时间:2023-11-01 23:06:33 32 4
gpt4 key购买 nike

首先是问题的背景:我有一个非常大的图表,存储成本约为 4GB。大约 3M 个节点和 34M 个边。我的程序采用这个大图并从中递归构建较小的图。在递归的每个级别,我都有两个图 - 原始图和从原始图创建的图。递归一直持续到图被缩减为非常小的图,比如大约 10 个节点。

由于我在整个程序执行过程中都需要这些图表,因此内存效率对我的应用程序至关重要。

这是我目前遇到的问题:这是从大图创建小图的算法:

/**
 * Builds a coarsened graph from {@code g}.
 *
 * <p>Each fine edge contributes weight to one or more coarse edges depending on
 * the coarse/fine (C/F) status of its endpoints: a C-C edge contributes its own
 * weight directly; an F endpoint is projected onto its coarse neighbors,
 * weighted by the interpolation coefficient {@code pij} of the connecting edge.
 *
 * @param g     the fine graph being coarsened
 * @param seeds ids of the coarse (seed) nodes; used only to presize the table
 * @return the coarse graph assembled from the accumulated edge weights
 */
public static Graph buildByTriples(Graph g, ArrayList<Integer> seeds) {
    // Collect every undirected edge exactly once. Edges are stored with
    // u < v (see Graph.addEdge), so keeping only the i < endpoint case
    // visits each edge a single time.
    ArrayList<Edge> edges = new ArrayList<>(g.getEdgeCount());
    for (int i = 0; i < g.size(); i++) {
        for (Edge e : g.adj(i)) {
            int v = e.getEndpoint(i);
            if (i < v) {
                edges.add(e);
            }
        }
    }

    Table<Integer, Integer, Double> coarseEdges = HashBasedTable.create(seeds.size(), seeds.size());
    // Accumulate coarse weights; a plain loop (rather than stream().forEach
    // with side effects) keeps the mutation of coarseEdges explicit.
    for (Edge e : edges) {
        int v = e.getV();
        int u = e.getU();
        if (g.isC(u) && g.isC(v)) { // C-C: the edge survives coarsening as-is
            addToTable(coarseEdges, u, v, e.getWeight());
        } else if (!g.isC(u) && g.isC(v)) { // F-C: project u onto its coarse neighbors
            for (Edge cEdge : g.cAdj(u)) {
                int nb = cEdge.getEndpoint(u);
                if (nb != v) { // skip the self-pair (v, v)
                    addToTable(coarseEdges, v, nb, cEdge.getPij() * e.getWeight());
                }
            }
        } else if (g.isC(u) && !g.isC(v)) { // C-F: symmetric to the F-C case
            for (Edge cEdge : g.cAdj(v)) {
                int nb = cEdge.getEndpoint(v);
                if (nb != u) {
                    addToTable(coarseEdges, u, nb, cEdge.getPij() * e.getWeight());
                }
            }
        } else { // F-F: project both endpoints onto their coarse neighbors
            for (Edge cEdgeU : g.cAdj(u)) {
                int uNb = cEdgeU.getEndpoint(u);
                for (Edge cEdgeV : g.cAdj(v)) {
                    int vNb = cEdgeV.getEndpoint(v);
                    if (uNb != vNb) { // no self loops in the coarse graph
                        addToTable(coarseEdges, uNb, vNb, cEdgeU.getPij() * e.getWeight() * cEdgeV.getPij());
                    }
                }
            }
        }
    }

    // Materialize the coarse graph from the accumulated (u, v) -> weight table.
    return createGraph(g, coarseEdges);
}

/**
 * Accumulates {@code val} onto the undirected coarse edge {@code (r, c)}.
 *
 * <p>Keys are normalized so the smaller node id is always the row and the
 * larger the column; (r, c) and (c, r) therefore share one accumulator.
 *
 * @param tbl accumulator table, keyed (min(r,c), max(r,c))
 * @param r   one endpoint's node id
 * @param c   the other endpoint's node id
 * @param val weight contribution to add
 */
private static void addToTable(Table<Integer, Integer, Double> tbl, int r, int c, double val) {
    int mn = Math.min(r, c); // the smaller of the two node ids
    // BUG FIX: was Math.min(r, c) again, so every entry landed on the
    // degenerate key (mn, mn) and (r, c)/(c, r) weights were corrupted.
    int mx = Math.max(r, c); // the larger of the two node ids
    Double existing = tbl.get(mn, mx); // single lookup instead of contains+get
    tbl.put(mn, mx, existing == null ? val : existing + val);
}

现在,当我这样做时,我很快就会耗尽内存。我用 YourKit 分析了应用程序,并且内存使用率超过了屋顶(在用完之前超过 6GB),因此 CPU 使用率也是如此。 coarseEdges 可以变得非常大。是否有更好的内存中 Map 实现可以扩展到大型数据集?或者有没有更好的方法在不存储 coarseEdges 的情况下执行此操作?

PS:请注意,我的图表无法在恒定时间内检索边(u,v)。它基本上是一个列表列表,这可以更好地为我的应用程序的其他关键部分提供性能。

**Also see my graph implementation code below:**
/**
 * Undirected graph backed by per-node adjacency lists ({@link EdgeList}).
 * Besides adjacency it maintains, per node: the summed weight of all incident
 * edges, the summed weight restricted to coarse neighbors, plain and coarse
 * degrees, a volume, and a coarse/fine flag.
 */
public class Graph{
    private final int SIZE;                       // number of nodes; node ids are 0..SIZE-1
    private final EdgeList[] adjacency;           // all incident edges per node
    private final EdgeList[] coarseAdjacency;     // incident edges whose opposite endpoint is coarse
    private final float[] nodeVolumes;            // per-node volume, initialized to 1
    private final double[] incidentWeight;        // sum of weights of all incident edges
    private final double[] incidentCoarseWeight;  // same sum restricted to coarse neighbors
    private final int[] degrees;                  // number of incident edges
    private final int[] coarseDegrees;            // number of coarse neighbors
    private final boolean[] coarseFlag;           // true when the node is a coarse (seed) node
    private int edgeCount = 0;

    public Graph(int SIZE){
        this.SIZE = SIZE;
        adjacency = new EdgeList[SIZE];
        coarseAdjacency = new EdgeList[SIZE];
        nodeVolumes = new float[SIZE];
        coarseFlag = new boolean[SIZE];
        incidentWeight = new double[SIZE];
        incidentCoarseWeight = new double[SIZE];
        degrees = new int[SIZE];
        coarseDegrees = new int[SIZE];
        for (int node = 0; node < SIZE; node++) {
            adjacency[node] = new EdgeList();
            coarseAdjacency[node] = new EdgeList();
            nodeVolumes[node] = 1;
        }
    }

    /**
     * Adds an undirected edge of weight {@code w} between {@code u} and
     * {@code v}. The edge is normalized so its stored endpoints satisfy
     * u &lt; v, and one shared Edge object is placed in both adjacency lists.
     *
     * @throws UnsupportedOperationException on a self loop (u == v)
     */
    public void addEdge(int u, int v, double w){
        if (u == v) {
            throw new UnsupportedOperationException("Self loops not allowed in graph"); //TODO: Need a graph validation routine
        }
        Edge e = (u < v) ? new Edge(u, v, w) : new Edge(v, u, w);
        adjacency[u].add(e);
        adjacency[v].add(e);
        // keep per-node weighted sums in step with the new edge
        incidentWeight[u] += w;
        incidentWeight[v] += w;
        // keep per-node degrees in step as well
        degrees[u]++;
        degrees[v]++;
        edgeCount++;
    }

    /** Returns the number of nodes. */
    public int size(){
        return SIZE;
    }

    /** Returns all edges incident to {@code v}. */
    public EdgeList adj(int v){
        return adjacency[v];
    }

    /** Returns the edges from {@code v} to its coarse neighbors. */
    public EdgeList cAdj(int v){
        return coarseAdjacency[v];
    }

    /** Sorts {@code u}'s adjacency list in place with {@code c}. */
    public void sortAdj(int u, Comparator<Edge> c){
        adjacency[u].sort(c);
    }

    /** Sorts {@code u}'s coarse adjacency list in place with {@code c}. */
    public void sortCoarseAdj(int u, Comparator<Edge> c){
        coarseAdjacency[u].sort(c);
    }

    /**
     * Marks {@code node} coarse (or fine). When marking coarse, every
     * neighbor's coarse adjacency, coarse weighted sum, and coarse degree are
     * updated; un-marking performs no bookkeeping rollback.
     */
    public void setCoarse(int node, boolean c){
        coarseFlag[node] = c;
        if (!c) {
            return;
        }
        // register the newly coarse node with each of its neighbors
        for (Edge e : adj(node)) {
            int neighbor = e.getEndpoint(node);
            coarseAdjacency[neighbor].add(e);
            incidentCoarseWeight[neighbor] += e.getWeight();
            coarseDegrees[neighbor]++;
        }
    }

    /** Returns the total number of edges added so far. */
    public int getEdgeCount(){
        return edgeCount;
    }

    /** Returns true when node {@code id} is coarse. */
    public boolean isC(int id){
        return coarseFlag[id];
    }

    /** Returns the summed weight of all edges incident to {@code node}. */
    public double weightedDegree(int node){
        return incidentWeight[node];
    }

    /** Returns the summed weight of edges from {@code node} to coarse neighbors. */
    public double weightedCoarseDegree(int node){
        return incidentCoarseWeight[node];
    }

    /** Returns the number of edges incident to {@code u}. */
    public int degree(int u){
        return degrees[u];
    }

    /** Returns the number of coarse neighbors of {@code u}. */
    public int cDegree(int u){
        return coarseDegrees[u];
    }

    /** Returns the {@code idx}-th edge in {@code u}'s coarse adjacency list. */
    public Edge getCNeighborAt(int u, int idx){
        return coarseAdjacency[u].getAt(idx);
    }

    /** Returns the volume of node {@code u}. */
    public float volume(int u){
        return nodeVolumes[u];
    }

    /** Sets the volume of {@code node} to {@code v}. */
    public void setVolume(int node, float v){
        nodeVolumes[node] = v;
    }

    @Override
    public String toString() {
        return "Graph[nodes:" + SIZE + ",edges:" + edgeCount + "]";
    }

}


//Edges are first class objects.
/**
 * A first-class undirected edge between nodes {@code u} and {@code v}.
 * Carries a weight, an interpolation coefficient {@code pij}, an algebraic
 * distance, and a soft-delete flag that iteration (see EdgeList) honors.
 */
public class Edge {
    private int u;
    private int v;
    private double weight;
    private double pij;
    // default algebraic distance: reciprocal of the global epsilon
    private double algebraicDist = (1/Constants.EPSILON);
    private boolean deleted = false; // soft-delete marker; edge stays in lists

    /** Creates an edge between {@code u} and {@code v} with the given weight. */
    public Edge(int u, int v, double weight) {
        this.u = u;
        this.v = v;
        this.weight = weight;
    }

    /** Creates an uninitialized edge (all fields at defaults). */
    public Edge() {
    }

    /**
     * Returns the endpoint opposite {@code from}. Note: when {@code from}
     * matches neither endpoint, {@code v} is returned rather than failing.
     */
    public int getEndpoint(int from){
        return (from == v) ? u : v;
    }

    public int getU() {
        return u;
    }

    public void setU(int u) {
        this.u = u;
    }

    public int getV() {
        return v;
    }

    public void setV(int v) {
        this.v = v;
    }

    public double getWeight() {
        return weight;
    }

    public void setWeight(double weight) {
        this.weight = weight;
    }

    public double getPij() {
        return pij;
    }

    public void setPij(double pij) {
        this.pij = pij;
    }

    public double getAlgebraicDist() {
        return algebraicDist;
    }

    public void setAlgebraicDist(double algebraicDist) {
        this.algebraicDist = algebraicDist;
    }

    public boolean isDeleted() {
        return deleted;
    }

    public void setDeleted(boolean deleted) {
        this.deleted = deleted;
    }

    @Override
    public String toString() {
        return "Edge[u:" + u + ", v:" + v + "]";
    }
}


// The Edge iterable
/**
 * An iterable list of edges whose iterator transparently skips edges marked
 * deleted (soft deletion; see {@code Edge.isDeleted()}).
 */
public class EdgeList implements Iterable<Edge>{
    private final ArrayList<Edge> data = new ArrayList<>(); // was a raw ArrayList

    /** Appends {@code e}; duplicates and deleted edges are not rejected. */
    public void add(Edge e){
        data.add(e);
    }

    @Override
    public Iterator<Edge> iterator() {
        return new IteratorImpl();
    }

    private class IteratorImpl implements Iterator<Edge> {

        private int currentIndex = 0;
        // Size is snapshotted at creation: edges added during iteration are
        // not visited.
        private final int N = data.size();

        @Override
        public boolean hasNext() {
            // advance past soft-deleted edges
            while (currentIndex < N && data.get(currentIndex).isDeleted()) {
                currentIndex++;
            }
            return currentIndex < N;
        }

        @Override
        public Edge next() {
            // BUG FIX: the original returned data.get(currentIndex++) directly,
            // so calling next() without hasNext() could return a deleted edge
            // and overran the list with IndexOutOfBoundsException instead of
            // the Iterator-contract NoSuchElementException.
            if (!hasNext()) {
                throw new java.util.NoSuchElementException();
            }
            return data.get(currentIndex++);
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /** Returns the edge at {@code idx}, including soft-deleted ones. */
    public Edge getAt(int idx){
        return data.get(idx);
    }

    /** Sorts the underlying list in place with {@code c}. */
    public void sort(Comparator<Edge> c){
        data.sort(c);
    }
}

最佳答案

这里有一些可以盲试的方向（blind shots）——您需要实际实现它们才能看到各自有多大帮助。

1) 您可能会考虑将组合键 (int,int) 与 hashmap 一起使用,而不是 guava 表。对于边缘权重来说肯定会更有效。如果您需要查询从某个顶点传出的边,那么它就不那么明显了,但是您需要查看 cpu 与内存的权衡。

2) 如果你使用普通的 hashmap，可以考虑换成一种堆外（off-heap）实现，例如 https://github.com/OpenHFT/Chronicle-Map ，它可能会有帮助。

3) 如果你留在内存中，想挤出一些额外的空间，可以用原始类型（primitive）map 做一些技巧：使用 long->double 映射，例如 http://labs.carrotsearch.com/download/hppc/0.4.1/api/com/carrotsearch/hppc/LongDoubleMap.html 或 http://trove4j.sourceforge.net/javadocs/gnu/trove/map/hash/TLongDoubleHashMap.html ，将您的 2 个 int 顶点 id 编码成一个 long 作为键，再看看能省多少。如果您使用的是 64 位 JVM，一个 Integer 可以占用 16 个字节（假设启用压缩 oops），一个 Double 占 24 个字节——每个条目合计 16+16+24=56 个字节，而原始类型映射每条目仅需 8+8 个字节。

关于java - 在内存中存储大 map ,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38920824/

32 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com