Avoiding data races when accessing a shared vector from multiple threads in Julia

I have a code pattern that looks something like this:

function output(G)
    hashes = []
    objects = []
    for g ∈ G
        ng = compute(g)
        h = hash(ng)
        # keep ng only if no object with the same hash has been seen yet
        if h ∉ hashes
            push!(hashes, h)
            push!(objects, ng)
        end
    end
    return objects
end

To speed this up, I want to parallelize the loop using the @threads macro; compute(g) is by far the most expensive part of the loop, so I expect multi-threading to help even if it introduces some overhead.
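(For completeness: I am starting Julia with multiple threads, and a quick sanity check along these lines confirms that @threads actually has threads to work with:)

using Base.Threads

# @threads can only use the threads Julia was started with,
# e.g. via `julia --threads=4` or the JULIA_NUM_THREADS environment variable
println(nthreads())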

According to the Julia documentation, to avoid a data race I need to use a lock. I tried to modify the function as follows:

using Base.Threads

function parallel_output(G)
    hashes = []
    objects = []
    lk = ReentrantLock()
    @threads for g ∈ G
        ng = compute(g)   # expensive call, runs in parallel outside the lock
        h = hash(ng)

        # serialize the check-and-push so only one thread mutates the shared vectors
        lock(lk) do
            if h ∉ hashes
                push!(hashes, h)
                push!(objects, ng)
            end
        end
    end
    return objects
end

But even with the lock, it still returns the wrong answer: the objects vector ends up with a different size depending on whether the function runs single-threaded or multi-threaded.
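For reference, this is how I compare the two versions (compute here is just a hypothetical stand-in so the snippet is runnable; my real compute is much more expensive):

# hypothetical stand-in for the real, expensive compute(g)
compute(g) = g^2 % 97

G = 1:10_000
println(length(output(G)))           # single-threaded reference size
println(length(parallel_output(G)))  # with my real compute, this size comes out different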

Is there a better way to parallelize this type of loop, or another way to avoid the data race?
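One alternative I've considered is to keep only compute inside the parallel part and do the deduplication serially afterwards. A sketch of what I mean (spawned_output is just a working name, and this assumes compute is pure):

using Base.Threads

function spawned_output(G)
    # parallelize only the expensive compute, one task per element
    tasks = [Threads.@spawn compute(g) for g in G]
    ngs = fetch.(tasks)

    # deduplicate serially, so no shared state is mutated concurrently
    hashes = Set{UInt}()   # hash() returns a UInt
    objects = []
    for ng in ngs
        h = hash(ng)
        if h ∉ hashes
            push!(hashes, h)
            push!(objects, ng)
        end
    end
    return objects
end

But I don't know whether this is idiomatic, or whether materializing all results before deduplicating just trades the lock for extra memory.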
