Exception in thread "main" java.lang.OutOfMemoryError: Java heap space while loading bulk data


Amyth Arora <aroras....@...>
 

Hi Everyone,

I am trying to upload some dummy data for testing purposes to janusgraph (google cloud bigtable backend). I have a groovy script as follows that I execute while running the gremlin console that creates the schema, indexes, vertexes and edges.

import groovy.json.JsonSlurper;
import java.util.ArrayList;
import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversalSource;
import org.janusgraph.core.JanusGraphFactory;
import org.janusgraph.core.PropertyKey;
import org.janusgraph.core.Multiplicity;
import org.janusgraph.core.schema.SchemaAction;
import org.janusgraph.core.schema.SchemaStatus;
import org.janusgraph.core.util.JanusGraphCleanup;
import org.janusgraph.graphdb.database.StandardJanusGraph;
import org.janusgraph.graphdb.database.management.ManagementSystem;

/**
 * Given a json file, populates data into the given JanusGraph DB
 */
class JanusGraphBuilder {

    String graphPath;
    StandardJanusGraph graph;
    ManagementSystem management;
    GraphTraversalSource traversal;

    def dummyData;


    public void main(String jsonPath, String janusGraphPath) {
        this.graphPath  = janusGraphPath
        this.initGraph()
        this.initialize(jsonPath)
        this.populate()
    }

    public void createEdges(def edges) {
        println "Preparing edges."
        edges.each {
            def relation = it.edge
            def properties = it.properties
            def vertexFrom = this.traversal.V().has("uid", it.nodes[0])[0]
            def vertexTo = this.traversal.V().has("uid", it.nodes[1])[0]
            def newEdge = vertexFrom.addEdge(relation, vertexTo)
            properties.each {
                if (it.key == 'score') {
                    it.value = Float.parseFloat(it.value.toString())
                }
                newEdge.property(it.key, it.value)
            }
        }
        this.graph.tx().commit()
        println "Created edges successfully"
    }

    public void createVertexes(def vertexes) {
        println "Preparing vertices."
        vertexes.each {
            def uniqueLabel = it.labels[0]
            def properties = it.properties
            def newVertex = this.graph.addVertex(label, uniqueLabel)
            properties.each {
                newVertex.property(it.key, it.value)
            }
        }
        this.graph.tx().commit()
        println "Created vertices successfully"
    }

    public void createSchema() {
        println "Preparing schema."
        // Do not create indexes while another transaction is in progress
        this.graph.tx().rollback()
        this.management = this.graph.openManagement()
        this.management.set('ids.block-size', 20000000)

        // Make property keys
        def uid = this.management.makePropertyKey("uid").dataType(String.class).make()
        def name = this.management.makePropertyKey("name").dataType(String.class).make()
        def number = this.management.makePropertyKey("number").dataType(String.class).make()
        def email = this.management.makePropertyKey("email").dataType(String.class).make()
        def score = this.management.makePropertyKey("score").dataType(Float.class).make()
        def linkedinId = this.management.makePropertyKey("linkedin_id").dataType(String.class).make()
        def linkedinUrl = this.management.makePropertyKey("profile_url").dataType(String.class).make()
        def imageUrl = this.management.makePropertyKey("image_url").dataType(String.class).make()
        def instituteName = this.management.makePropertyKey("institute_name").dataType(String.class).make()
        def companyName = this.management.makePropertyKey("company_name").dataType(String.class).make()
        def jobId = this.management.makePropertyKey("job_id").dataType(String.class).make()

        // Define Vertex Labels
        this.management.makeVertexLabel("person").make();
        this.management.makeVertexLabel("candidate").make();
        this.management.makeVertexLabel("recruiter").make();
        this.management.makeVertexLabel("employee").make();
        this.management.makeVertexLabel("linkedin").make();
        this.management.makeVertexLabel("job").make();
        this.management.makeVertexLabel("company").make();
        this.management.makeVertexLabel("institute").make();
        def phoneV = this.management.makeVertexLabel("phone").make();
        def emailV = this.management.makeVertexLabel("email").make();

        // Define Edge Labels
        this.management.makeEdgeLabel("knows").make();
        this.management.makeEdgeLabel("has").make();
        this.management.makeEdgeLabel("provided_by").make();
        this.management.makeEdgeLabel("studied_at").make();
        this.management.makeEdgeLabel("worked_at").make();
        this.management.makeEdgeLabel("posted").make();
        this.management.makeEdgeLabel("liked").make();
        this.management.makeEdgeLabel("worked_with").make();
        this.management.makeEdgeLabel("studied_with").make();
        this.management.makeEdgeLabel("is_a_match_for").make();

        // Create indexes
        this.management.buildIndex('uniqueUid', Vertex.class).addKey(uid).unique().buildCompositeIndex()
        this.management.buildIndex('uniqueEmail', Vertex.class).addKey(email).indexOnly(emailV).unique().buildCompositeIndex()
        this.management.buildIndex('uniqueNumber', Vertex.class).addKey(number).indexOnly(phoneV).unique().buildCompositeIndex()
        this.management.commit()
        this.management.awaitGraphIndexStatus(this.graph, 'uniqueUid').call()
        this.management = this.graph.openManagement()
        this.management.updateIndex(this.management.getGraphIndex('uniqueUid'), SchemaAction.REINDEX).get()

        this.management.commit()

        println "Created schema successfully"
    }

    public void populate() {
        // Create db schema
        this.createSchema()

        // Create vertexes from the given dummy data
        def vertexTransaction = this.graph.newTransaction()
        def vertexes = this.dummyData.vertexes;
        this.createVertexes(vertexes)
        vertexTransaction.commit()
        this.initGraph()

        def edgeTransaction = this.graph.newTransaction()
        // Create edges from the given dummy data
        def edges = this.dummyData.edges;
        this.createEdges(edges)
        edgeTransaction.commit()
        this.initGraph()

        println "Graph population successfully accomplished. Please hit Ctrl+C to exit."
    }

    public void initialize(String jsonPath) {
        String fileContents = new File(jsonPath).getText('UTF-8')
        def slurper = new JsonSlurper()
        def results = slurper.parseText(fileContents)
        this.dummyData = results;
        this.resetData()
    }

    public void resetData() {
        // Remove all the data from the storage backend
        this.graph.close()
        JanusGraphCleanup.clear(this.graph)
        this.initGraph()
    }

    public void initGraph() {
        this.graph = JanusGraphFactory.open(this.graphPath)
        this.traversal = this.graph.traversal()
    }
}


JanusGraphBuilder graphBuilder = new JanusGraphBuilder()
graphBuilder.main("/tmp/dummy.json", "conf/testconf.properties")

I have already updated the heap size in the  `JAVA_OPTIONS` environment variable to `-Xmx2048m`. Also here is how my configuration looks like.

storage.backend=hbase

## Google cloud BIGTABLE configuration options
storage.hbase.ext.hbase.client.connection.impl=com.google.cloud.bigtable.hbase1_0.BigtableConnection
storage.hbase.ext.google.bigtable.project.id=my-project-id
storage.hbase.ext.google.bigtable.instance.id=my-instance-id

storage.hostname=localhost
cache.db-cache = true
cache.db-cache-clean-wait = 20
cache.db-cache-time = 180000
cache.db-cache-size = 0.5

# Bulk Loading
storage.batch-loading = true
ids.block-size=20000000

I execute the script using the following command:

./bin/gremlin.sh -e ~/projects/xonnect/utils/population_scripts/populateJanus.groovy

The problem is, even before the schema is created it throws out of memory error for java heap space. Following is the output.

18:44:29,112  INFO BigtableSession:75 - Bigtable options: BigtableOptions{dataHost=bigtable.googleapis.com, tableAdminHost=bigtableadmin.googleapis.com, instanceAdminHost=bigtableadmin.googleapis.com, projectId=formal-theater-175812, instanceId=xonnect, userAgent=hbase-1.2.4, credentialType=DefaultCredentials, port=443, dataChannelCount=10, retryOptions=RetryOptions{retriesEnabled=true, allowRetriesWithoutTimestamp=false, statusToRetryOn=[UNAUTHENTICATED, INTERNAL, ABORTED, UNAVAILABLE, DEADLINE_EXCEEDED], initialBackoffMillis=5, maxElapsedBackoffMillis=60000, backoffMultiplier=2.0, streamingBufferSize=60, readPartialRowTimeoutMillis=60000, maxScanTimeoutRetries=3}, bulkOptions=BulkOptions{asyncMutatorCount=2, useBulkApi=true, bulkMaxKeyCount=25, bulkMaxRequestSize=1048576, autoflushMs=0, maxInflightRpcs=500, maxMemory=190893260, enableBulkMutationThrottling=false, bulkMutationRpcTargetMs=100}, callOptionsConfig=CallOptionsConfig{useTimeout=false, shortRpcTimeoutMs=60000, longRpcTimeoutMs=600000}, usePlaintextNegotiation=false}.
18:44:30,851  INFO Backend:183 - Initiated backend operations thread pool of size 8
18:44:35,048  INFO IndexSerializer:85 - Hashing index keys
18:44:36,910  INFO KCVSLog:744 - Loaded unidentified ReadMarker start time 2017-08-08T13:14:36.898Z into org.janusgraph.diskstorage.log.kcvs.KCVSLog$MessagePuller@35835fa
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3332)
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:124)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:596)
at java.lang.StringBuilder.append(StringBuilder.java:190)
at org.codehaus.groovy.runtime.IOGroovyMethods.getText(IOGroovyMethods.java:886)
at org.codehaus.groovy.runtime.ResourceGroovyMethods.getText(ResourceGroovyMethods.java:588)
at org.codehaus.groovy.runtime.dgm$964.invoke(Unknown Source)
at org.codehaus.groovy.runtime.callsite.PojoMetaMethodSite$PojoMetaMethodSiteNoUnwrapNoCoerce.invoke(PojoMetaMethodSite.java:274)
at org.codehaus.groovy.runtime.callsite.PojoMetaMethodSite.call(PojoMetaMethodSite.java:56)
at org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:48)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:113)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:125)
at JanusGraphBuilder.initialize(Script1.groovy:146)
at JanusGraphBuilder.main(Script1.groovy:29)
at JanusGraphBuilder$main.call(Unknown Source)
at org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:48)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:113)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:133)
at Script1.run(Script1.groovy:168)
at org.apache.tinkerpop.gremlin.groovy.jsr223.GremlinGroovyScriptEngine.eval(GremlinGroovyScriptEngine.java:619)
at org.apache.tinkerpop.gremlin.groovy.jsr223.GremlinGroovyScriptEngine.eval(GremlinGroovyScriptEngine.java:448)
at org.apache.tinkerpop.gremlin.groovy.jsr223.GremlinGroovyScriptEngine.eval(GremlinGroovyScriptEngine.java:421)
at javax.script.AbstractScriptEngine.eval(AbstractScriptEngine.java:212)
at org.apache.tinkerpop.gremlin.groovy.jsr223.ScriptExecutor.evaluate(ScriptExecutor.java:55)
at org.apache.tinkerpop.gremlin.groovy.jsr223.ScriptExecutor.main(ScriptExecutor.java:44)

Any help will be much appreciated. Thanks.


Amyth (twiiter.com/mytharora)

Join janusgraph-users@lists.lfaidata.foundation to automatically receive all group messages.