File: parallel_filter.h

package info (click to toggle)
embree 3.13.5%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 27,924 kB
  • sloc: cpp: 180,815; xml: 3,877; ansic: 2,957; python: 1,466; sh: 502; makefile: 229; csh: 42
file content (93 lines) | stat: -rw-r--r-- 2,877 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
  /*! Stable in-place compaction: copies every element of [first,last) for
   *  which predicate holds to the front of the range, preserving relative
   *  order, and returns the index one past the last kept element. Elements
   *  at and beyond the returned index keep whatever values were left there. */
  template<typename Ty, typename Index, typename Predicate>
    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
  {
    Index write = first;          // next slot to receive a kept element
    Index read  = first;          // next element to test
    while (read < last)
    {
      if (predicate(data[read]))
      {
        data[write] = data[read]; // self-assignment when nothing was dropped yet
        write++;
      }
      read++;
    }
    return write;
  }

  /*! Parallel in-place filter over [begin,end): compacts the elements for
   *  which predicate holds to the front of the range and returns the index
   *  one past the last kept element. The set of kept elements is the same
   *  as sequential_filter's, but — judging from the back-to-front copy in
   *  phase 2 — the relative order of kept elements that get relocated is
   *  not preserved (NOTE(review): confirm callers do not rely on stability).
   *  Elements at and beyond the returned index are left in an unspecified
   *  state. minStepSize bounds the smallest per-task chunk. */
  template<typename Ty, typename Index, typename Predicate>
    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
  {
    /* sequential fallback for ranges too small to be worth splitting */
    if (end-begin <= minStepSize)
      return sequential_filter(data,begin,end,predicate);

    /* calculate number of tasks to use: bounded by thread count, by the
       number of minStepSize-sized blocks, and by the fixed-size scratch
       arrays below (MAX_TASKS) */
    enum { MAX_TASKS = 64 };
    const Index numThreads = TaskScheduler::threadCount();
    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);

    /* phase 1: each task compacts its own contiguous chunk independently.
       Afterwards chunk t holds nused[t] kept elements followed by a hole
       of nfree[t] stale slots. */
    Index nused[MAX_TASKS];   /* kept elements per task */
    Index nfree[MAX_TASKS];   /* hole size per task */
    parallel_for(taskCount, [&](const Index taskIndex)
    {
      /* chunk bounds: same formula is re-derived in phase 2, so it must
         stay identical there */
      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
      const Index i2 = sequential_filter(data,i0,i1,predicate);
      nused[taskIndex] = i2-i0;
      nfree[taskIndex] = i1-i2;
    });

    /* calculate offsets: sused = total kept elements (the final count),
       pfree[t] = exclusive prefix sum of hole sizes, i.e. the global index
       of task t's first free slot in the sequence of all free slots */
    Index sused=0;
    Index sfree=0;
    Index pfree[MAX_TASKS];
    for (Index i=0; i<taskCount; i++) 
    {
      sused+=nused[i];
      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
    }

    /* return if we did not filter out any element */
    assert(sfree <= end-begin);
    assert(sused <= end-begin);
    if (sused == end-begin)
      return end;

    /* phase 2: fill the holes below begin+sused with kept elements taken
       from the tail of the range, so that [begin,begin+sused) ends up
       holding exactly the kept elements */
    parallel_for(taskCount, [&](const Index taskIndex)
    {
      /* destination to write elements to: this task's own hole, clamped so
         we never fill slots at or beyond the final boundary begin+sused */
      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
      if (dst_end <= dst) return;   /* hole lies entirely past the boundary */

      /* range of misplaced elements to copy to destination: global free-slot
         indices [r0,r1) are mapped onto kept elements enumerated from the
         back of the range */
      Index r0 = pfree[taskIndex];
      Index r1 = r0+dst_end-dst;

      /* find range in misplaced elements in back to front order;
         k0/k1 count kept elements of the rear tasks as we walk backwards */
      Index k0=0;
      for (Index i=taskCount-1; i>0; i--)
      {
        if (k0 > r1) break;         /* already walked past everything we need */
        Index k1 = k0+nused[i];
        /* src = one past the last kept element of rear task i */
        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
        /* NOTE: inner 'i' deliberately(?) shadows the task index above —
           it iterates the overlap of [r0,r1) with this rear task's kept
           elements; isrc walks that task's kept elements back to front */
        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
          Index isrc = src-i+k0-1;
          assert(dst >= begin && dst < end);
          assert(isrc >= begin && isrc < end);
          data[dst++] = data[isrc];
        }
        k0 = k1;
      }
    });

    /* number of kept elements, now compacted into [begin,begin+sused) */
    return begin+sused;
  }
}