File: Makefile

package info (click to toggle)
halide 21.0.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 55,752 kB
  • sloc: cpp: 289,334; ansic: 22,751; python: 7,486; makefile: 4,299; sh: 2,508; java: 1,549; javascript: 282; pascal: 207; xml: 127; asm: 9
file content (323 lines) | stat: -rw-r--r-- 14,014 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
include ../support/Makefile.inc

# Archiver used to assemble $(LIBHALIDE_BLAS); overridable from the environment
# or the command line.
AR ?= ar
# Root output directory; everything this Makefile builds lands under it and
# `clean` removes it wholesale.
BIN ?= bin
# Holds the generator executables plus the kernel objects/headers they emit.
BUILD = $(BIN)/build
# Appended to whatever CXXFLAGS already holds.
# NOTE(review): -march=native ties the binaries to the build machine's CPU.
CXXFLAGS += -Wall -march=native -I /opt/local/include
# Halide target string handed to the generators as target=...
HL_TARGET ?= host
# Static library bundling the generated kernels, the Halide runtime object and
# the halide_blas.cpp wrapper.
LIBHALIDE_BLAS = $(BIN)/libhalide_blas.a
# Preprocessor flags used when compiling the Halide benchmark driver.
HALIDEBLAS_FLAGS ?= -DUSE_HALIDE
# Comma-separated output kinds passed to every kernel generator via -e.
EMIT_OPTIONS = stmt,assembly,object,c_header

# Include path used when compiling eigen_benchmarks.
EIGEN_INCLUDES ?= -I/usr/include/eigen3
# Reference CBLAS: link libraries and the define selecting it in the shared
# benchmark source; also used to build test_halide_blas.
CBLAS_LIBS ?= -lblas
CBLAS_FLAGS ?= -DUSE_CBLAS
# OpenBLAS: define and link line used for openblas_benchmarks.
OPENBLAS_FLAGS ?= -DUSE_OPENBLAS
OPENBLAS_LIBS ?= -L/opt/local/lib -lopenblas

# ATLAS should be built and installed locally to get the best performance.
# It is designed to automatically tune its performance to your machine during
# the build. Get the source code here: http://math-atlas.sourceforge.net/,
# then set the flags below to point to your local build of ATLAS.
#
# For example:
# ATLAS_FLAGS ?= -DUSE_ATLAS -I/opt/ATLAS/include
# ATLAS_LIBS ?= -L/opt/ATLAS/lib -lptcblas -latlas

# By default use whatever ATLAS is installed, so that this at least builds.
# Only warn when the caller has not supplied ATLAS_LIBS (via the environment,
# the command line or an earlier assignment); otherwise the message would be
# wrong, and it would be repeated by every recursive make invocation.
ifeq ($(origin ATLAS_LIBS),undefined)
$(warning Warning: Defaulting to system ATLAS, which may be slow. See the Makefile for details.)
endif
ATLAS_FLAGS ?= -DUSE_ATLAS
ATLAS_LIBS ?= -lcblas

# Note that we deliberately build the generators with the no_runtime flag;
# this provides a slight build speed increase (since we don't have to redundantly
# include the runtime code in each generator) with the extra complication that we
# must explicitly build and link the halide runtime separately.
HL_TARGET_NR = $(HL_TARGET)-no_runtime-no_bounds_query

# The L1 blas kernels are a small number of floating point ops each,
# so the asserts make a difference. They are scheduled to work for any
# size input so we'll just omit the asserts.
HL_TARGET_NR_NA = $(HL_TARGET_NR)-no_asserts

# Names of the Halide-generated kernels. Each entry <k> corresponds to the
# generated files $(BUILD)/halide_<k>.h and $(BUILD)/halide_<k>.o.
KERNELS = \
	scopy_impl \
	dcopy_impl \
	sscal_impl \
	dscal_impl \
	saxpy_impl \
	daxpy_impl \
	sdot \
	ddot \
	sasum \
	dasum \
	sgemv_notrans \
	dgemv_notrans \
	sgemv_trans \
	dgemv_trans \
	sger_impl \
	dger_impl \
	sgemm_notrans \
	dgemm_notrans \
	sgemm_transA \
	dgemm_transA \
	sgemm_transB \
	dgemm_transB \
	sgemm_transAB \
	dgemm_transAB \

# Benchmark executables, one per BLAS implementation being compared. All of
# them are prerequisites of `run_benchmarks` and of the normal `build` target.
BENCHMARKS = \
	$(BIN)/cblas_benchmarks \
	$(BIN)/atlas_benchmarks \
	$(BIN)/openblas_benchmarks \
	$(BIN)/eigen_benchmarks \
	$(BIN)/halide_benchmarks

# None of these names is a file; declare them phony so a stray file called
# e.g. `all` or `test` cannot make the target look up to date. The per-routine
# *_benchmark_% targets are deliberately NOT listed: phony targets skip
# implicit-rule search, which is how those targets get their recipes.
.PHONY: all build clean run_benchmarks test l1_benchmarks l2_benchmarks l3_benchmarks

# Build everything, then run the benchmark suite. $(MAKE) (rather than a
# literal `make`) re-invokes the same make program that is running this file.
all: build
	$(MAKE) run_benchmarks

# This is a hack: disable this test when compiling 32-bit systems, as it's hard to find the right 32-bit versions
# of these libraries on 64-bit hosts. Can't rely on HL_TARGET because it might be 'host' even for cross-compiling.
# Look instead for `-m32` being passed to CXX, which is the cross-compiling flag we use. This is regrettable
# but expedient. (Note that CMake is able to find this correctly, and so we have test coverage there; this is
# simply not worth debugging as an edge case at the moment.)
ifneq (,$(findstring -m32,$(CXX)))
# CXX carries -m32: `build` only prints a notice and `test` does nothing more.
build:
	@echo linear_algebra not support using Make on 32-bit systems: skipping linear_algebra tests...
test: build
else
# Not a -m32 build: proceed only if a cblas.h exists directly under
# /usr/include or one directory level below it.
ifneq ("$(wildcard /usr/include/cblas.h /usr/include/*/cblas.h)","")
# Normal case: build every benchmark plus the correctness test; `test` runs it.
build: $(BENCHMARKS) $(BIN)/test_halide_blas
test: $(BIN)/test_halide_blas
	$(BIN)/test_halide_blas
else
# No cblas.h found: `build` only prints a notice, so `test` succeeds trivially.
build:
	@echo /usr/include/cblas.h not found: skipping linear_algebra tests...
test: build
endif
endif

# Remove the whole output tree. With the default BUILD = $(BIN)/build this also
# removes the generators and generated kernels. benchmarks.dat and
# benchmarks.csv are written to the current directory and are not removed here.
clean:
	rm -rf $(BIN)

# Generated per-kernel headers, and the objects that go into the static
# library: one object per kernel plus the separately generated Halide runtime.
KERNEL_HEADERS = $(patsubst %,$(BUILD)/halide_%.h,$(KERNELS))
KERNEL_OBJECTS = $(patsubst %,$(BUILD)/halide_%.o,$(KERNELS)) $(BUILD)/halide_runtime.o

# Compile the BLAS wrapper. It includes the generated kernel headers, so every
# header must exist (and be current) before this object is built.
$(BUILD)/halide_blas.o: src/halide_blas.cpp src/halide_blas.h $(KERNEL_HEADERS)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -c -o $@ -I ../../include/ -I ../support -I$(BUILD) $<

# Bundle the kernel objects, the Halide runtime and the wrapper object into a
# static library.
#
# The archive is rebuilt from scratch each time: `ar q` appends blindly, so an
# incremental rebuild would leave stale duplicate members in the archive ahead
# of the fresh ones. Removing the old archive and using `rcs` (replace, create,
# write symbol index) keeps its contents equal to the current prerequisites.
$(LIBHALIDE_BLAS): $(KERNEL_OBJECTS) $(BUILD)/halide_blas.o
	@mkdir -p $(@D)
	rm -f $@
	$(AR) rcs $@ $(filter-out %.a,$^)

# Correctness test: links the test driver and the Halide BLAS library against
# the reference CBLAS.
$(BIN)/test_halide_blas: tests/test_halide_blas.cpp $(LIBHALIDE_BLAS)
	$(CXX) $(CXXFLAGS) -Wno-unused-variable -o $@ \
	-I../../include/ -I../support -Isrc -I$(BUILD) \
	$(CBLAS_FLAGS) $^ $(CBLAS_LIBS) $(LDFLAGS)

# Large powers of two are a pathological case for the cache, so avoid
# them for the benchmarks.
BENCHMARK_SIZES = 64 128 256 512 1280 2560
# Routine names passed as the first argument to each benchmark binary, grouped
# by BLAS level. Each name is also the suffix of a <pkg>_lN_benchmark_<name>
# target.
L1_BENCHMARKS = scopy dcopy sscal dscal saxpy daxpy sdot ddot sasum dasum
L2_BENCHMARKS = sgemv_notrans dgemv_notrans sgemv_trans dgemv_trans sger dger
L3_BENCHMARKS = sgemm_notrans dgemm_notrans sgemm_transA dgemm_transA sgemm_transB dgemm_transB sgemm_transAB dgemm_transAB

# <pkg>_l1_benchmark_<routine>: run <pkg>'s benchmark binary for <routine>
# once per entry of BENCHMARK_SIZES. $* is the routine name matched by %.
cblas_l1_benchmark_%: $(BIN)/cblas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/cblas_benchmarks $* $(n);)

atlas_l1_benchmark_%: $(BIN)/atlas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/atlas_benchmarks $* $(n);)

openblas_l1_benchmark_%: $(BIN)/openblas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/openblas_benchmarks $* $(n);)

eigen_l1_benchmark_%: $(BIN)/eigen_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/eigen_benchmarks $* $(n);)

halide_l1_benchmark_%: $(BIN)/halide_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/halide_benchmarks $* $(n);)

# Aggregate target: every level-1 routine for every package.
l1_benchmarks: \
	$(addprefix cblas_l1_benchmark_,$(L1_BENCHMARKS)) \
	$(addprefix atlas_l1_benchmark_,$(L1_BENCHMARKS)) \
	$(addprefix openblas_l1_benchmark_,$(L1_BENCHMARKS)) \
	$(addprefix eigen_l1_benchmark_,$(L1_BENCHMARKS)) \
	$(addprefix halide_l1_benchmark_,$(L1_BENCHMARKS))

# <pkg>_l2_benchmark_<routine>: run <pkg>'s benchmark binary for <routine>
# once per entry of BENCHMARK_SIZES. $* is the routine name matched by %.
cblas_l2_benchmark_%: $(BIN)/cblas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/cblas_benchmarks $* $(n);)

atlas_l2_benchmark_%: $(BIN)/atlas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/atlas_benchmarks $* $(n);)

openblas_l2_benchmark_%: $(BIN)/openblas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/openblas_benchmarks $* $(n);)

eigen_l2_benchmark_%: $(BIN)/eigen_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/eigen_benchmarks $* $(n);)

halide_l2_benchmark_%: $(BIN)/halide_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/halide_benchmarks $* $(n);)

# Aggregate target: every level-2 routine for every package.
l2_benchmarks: \
	$(addprefix cblas_l2_benchmark_,$(L2_BENCHMARKS)) \
	$(addprefix atlas_l2_benchmark_,$(L2_BENCHMARKS)) \
	$(addprefix openblas_l2_benchmark_,$(L2_BENCHMARKS)) \
	$(addprefix eigen_l2_benchmark_,$(L2_BENCHMARKS)) \
	$(addprefix halide_l2_benchmark_,$(L2_BENCHMARKS))

# <pkg>_l3_benchmark_<routine>: run <pkg>'s benchmark binary for <routine>
# once per entry of BENCHMARK_SIZES. $* is the routine name matched by %.
cblas_l3_benchmark_%: $(BIN)/cblas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/cblas_benchmarks $* $(n);)

atlas_l3_benchmark_%: $(BIN)/atlas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/atlas_benchmarks $* $(n);)

openblas_l3_benchmark_%: $(BIN)/openblas_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/openblas_benchmarks $* $(n);)

eigen_l3_benchmark_%: $(BIN)/eigen_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/eigen_benchmarks $* $(n);)

halide_l3_benchmark_%: $(BIN)/halide_benchmarks
	@$(foreach n,$(BENCHMARK_SIZES),$(BIN)/halide_benchmarks $* $(n);)

# Aggregate target: every level-3 routine for every package.
l3_benchmarks: \
	$(addprefix cblas_l3_benchmark_,$(L3_BENCHMARKS)) \
	$(addprefix atlas_l3_benchmark_,$(L3_BENCHMARKS)) \
	$(addprefix openblas_l3_benchmark_,$(L3_BENCHMARKS)) \
	$(addprefix eigen_l3_benchmark_,$(L3_BENCHMARKS)) \
	$(addprefix halide_l3_benchmark_,$(L3_BENCHMARKS))

# Print a table header, then run the level 1, 2 and 3 benchmark groups in
# order via sub-makes.
#
# $(MAKE) re-invokes the same make program that is processing this file
# (a literal `make` may be a different tool). -j1 is forced because $(MAKE)
# would otherwise inherit the parent's -j, and benchmarks running concurrently
# would distort each other's timings.
run_benchmarks: $(BENCHMARKS)
	@echo " Package     Subroutine    Size             Runtime     GFLOPS"
	@$(MAKE) -j1 --no-print-directory l1_benchmarks
	@$(MAKE) -j1 --no-print-directory l2_benchmarks
	@$(MAKE) -j1 --no-print-directory l3_benchmarks

# Capture the benchmark table in benchmarks.dat, then convert its first five
# whitespace-separated columns to CSV. $(MAKE) re-invokes the same make
# program that is processing this file instead of whatever `make` is on PATH.
# $$N is doubled so that awk, not make, sees the field references.
benchmarks.csv: $(BENCHMARKS)
	$(MAKE) --no-print-directory run_benchmarks > benchmarks.dat
	awk '{printf("%s,%s,%s,%s,%s\n",$$1,$$2,$$3,$$4,$$5)}' benchmarks.dat > $@

# The CBLAS, ATLAS and OpenBLAS drivers are all compiled from the same source;
# the per-package *_FLAGS define selects the implementation and *_LIBS links it.
$(BIN)/cblas_benchmarks: benchmarks/cblas_benchmarks.cpp benchmarks/clock.h benchmarks/macros.h
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -o $@ -I$(BUILD) $(CBLAS_FLAGS) $< $(CBLAS_LIBS)

$(BIN)/atlas_benchmarks: benchmarks/cblas_benchmarks.cpp benchmarks/clock.h benchmarks/macros.h
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -o $@ -I$(BUILD) $(ATLAS_FLAGS) $< $(ATLAS_LIBS)

$(BIN)/openblas_benchmarks: benchmarks/cblas_benchmarks.cpp benchmarks/clock.h benchmarks/macros.h
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -o $@ -I$(BUILD) $(OPENBLAS_FLAGS) $< $(OPENBLAS_LIBS)

# The Eigen driver is compiled with only an include path; no library is linked.
$(BIN)/eigen_benchmarks: benchmarks/eigen_benchmarks.cpp benchmarks/clock.h benchmarks/macros.h
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -o $@ -I$(BUILD) $(EIGEN_INCLUDES) $<

# Halide driver: links the locally built static library.
$(BIN)/halide_benchmarks: benchmarks/halide_benchmarks.cpp benchmarks/clock.h benchmarks/macros.h $(LIBHALIDE_BLAS)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) -o $@ -Isrc -I$(BUILD) $(HALIDEBLAS_FLAGS) $< $(LIBHALIDE_BLAS) $(LDFLAGS)

# Build a generator executable $(BUILD)/<name>.generator from
# src/<name>_generators.cpp. Only the .cpp prerequisites are passed to the
# compiler; anything else in $(GENERATOR_DEPS) acts purely as a dependency.
# NOTE(review): GENERATOR_DEPS and LIBHALIDE_LDFLAGS are not set in this file;
# presumably they come from ../support/Makefile.inc -- confirm.
$(BUILD)/%.generator: src/%_generators.cpp $(GENERATOR_DEPS)
	@mkdir -p $(@D)
	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) -o $@

# Emit the standalone Halide runtime object that the no_runtime kernels are
# linked against. It is generated for the plain $(HL_TARGET), i.e. without the
# no_runtime/no_bounds_query/no_asserts suffixes used for the kernels.
# This can use any of the generators; pick an arbitrary one
$(BUILD)/halide_runtime.o: $(BUILD)/blas_l1.generator
	$< -o $(BUILD) -e object -r halide_runtime target=$(HL_TARGET)

# Level-1 kernels. All are produced by blas_l1.generator for
# $(HL_TARGET_NR_NA) with vectorize=true. The copy, scal and axpy kernels are
# all instances of the saxpy/daxpy generators and differ only in the scale_x /
# add_to_y parameters:
#   copy: scale_x=false add_to_y=false
#   scal: scale_x=true  add_to_y=false
#   axpy: scale_x=true  add_to_y=true
#
# NOTE(review): each rule names two targets (.o and .h) in one explicit rule.
# GNU make treats that as two independent rules sharing a recipe, so under -j
# the generator may be run twice for the same kernel, possibly concurrently.
$(BUILD)/halide_scopy_impl.o $(BUILD)/halide_scopy_impl.h: $(BUILD)/blas_l1.generator
	$< -g saxpy -f halide_scopy_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true scale_x=false add_to_y=false

$(BUILD)/halide_dcopy_impl.o $(BUILD)/halide_dcopy_impl.h: $(BUILD)/blas_l1.generator
	$< -g daxpy -f halide_dcopy_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true scale_x=false add_to_y=false

$(BUILD)/halide_sscal_impl.o $(BUILD)/halide_sscal_impl.h: $(BUILD)/blas_l1.generator
	$< -g saxpy -f halide_sscal_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true scale_x=true add_to_y=false

$(BUILD)/halide_dscal_impl.o $(BUILD)/halide_dscal_impl.h: $(BUILD)/blas_l1.generator
	$< -g daxpy -f halide_dscal_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true scale_x=true add_to_y=false

$(BUILD)/halide_saxpy_impl.o $(BUILD)/halide_saxpy_impl.h: $(BUILD)/blas_l1.generator
	$< -g saxpy -f halide_saxpy_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true scale_x=true add_to_y=true

$(BUILD)/halide_daxpy_impl.o $(BUILD)/halide_daxpy_impl.h: $(BUILD)/blas_l1.generator
	$< -g daxpy -f halide_daxpy_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true scale_x=true add_to_y=true

$(BUILD)/halide_sdot.o $(BUILD)/halide_sdot.h: $(BUILD)/blas_l1.generator
	$< -g sdot -f halide_sdot -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true

$(BUILD)/halide_ddot.o $(BUILD)/halide_ddot.h: $(BUILD)/blas_l1.generator
	$< -g ddot -f halide_ddot -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true

$(BUILD)/halide_sasum.o $(BUILD)/halide_sasum.h: $(BUILD)/blas_l1.generator
	$< -g sasum -f halide_sasum -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true

$(BUILD)/halide_dasum.o $(BUILD)/halide_dasum.h: $(BUILD)/blas_l1.generator
	$< -g dasum -f halide_dasum -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR_NA) vectorize=true

# Level-2 kernels. All are produced by blas_l2.generator for $(HL_TARGET_NR)
# (asserts stay enabled, unlike the level-1 kernels) with parallel=true and
# vectorize=true. The notrans/trans gemv variants are the same sgemv/dgemv
# generator with a different `transpose` parameter.
#
# NOTE(review): each rule names two targets (.o and .h) in one explicit rule.
# GNU make treats that as two independent rules sharing a recipe, so under -j
# the generator may be run twice for the same kernel, possibly concurrently.
$(BUILD)/halide_sgemv_notrans.o $(BUILD)/halide_sgemv_notrans.h: $(BUILD)/blas_l2.generator
	$< -g sgemv -f halide_sgemv_notrans -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) parallel=true vectorize=true transpose=false

$(BUILD)/halide_dgemv_notrans.o $(BUILD)/halide_dgemv_notrans.h: $(BUILD)/blas_l2.generator
	$< -g dgemv -f halide_dgemv_notrans -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) parallel=true vectorize=true transpose=false

$(BUILD)/halide_sgemv_trans.o $(BUILD)/halide_sgemv_trans.h: $(BUILD)/blas_l2.generator
	$< -g sgemv -f halide_sgemv_trans -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) parallel=true vectorize=true transpose=true

$(BUILD)/halide_dgemv_trans.o $(BUILD)/halide_dgemv_trans.h: $(BUILD)/blas_l2.generator
	$< -g dgemv -f halide_dgemv_trans -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) parallel=true vectorize=true transpose=true

$(BUILD)/halide_sger_impl.o $(BUILD)/halide_sger_impl.h: $(BUILD)/blas_l2.generator
	$< -g sger -f halide_sger_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) parallel=true vectorize=true

$(BUILD)/halide_dger_impl.o $(BUILD)/halide_dger_impl.h: $(BUILD)/blas_l2.generator
	$< -g dger -f halide_dger_impl -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) parallel=true vectorize=true

# Level-3 kernels. All are produced by blas_l3.generator for $(HL_TARGET_NR).
# The four variants per precision are the same sgemm/dgemm generator with
# every combination of the transpose_A / transpose_B parameters.
#
# NOTE(review): each rule names two targets (.o and .h) in one explicit rule.
# GNU make treats that as two independent rules sharing a recipe, so under -j
# the generator may be run twice for the same kernel, possibly concurrently.
$(BUILD)/halide_sgemm_notrans.o $(BUILD)/halide_sgemm_notrans.h: $(BUILD)/blas_l3.generator
	$< -g sgemm -f halide_sgemm_notrans -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=false transpose_B=false

$(BUILD)/halide_dgemm_notrans.o $(BUILD)/halide_dgemm_notrans.h: $(BUILD)/blas_l3.generator
	$< -g dgemm -f halide_dgemm_notrans -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=false transpose_B=false

$(BUILD)/halide_sgemm_transA.o $(BUILD)/halide_sgemm_transA.h: $(BUILD)/blas_l3.generator
	$< -g sgemm -f halide_sgemm_transA -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=true transpose_B=false

$(BUILD)/halide_dgemm_transA.o $(BUILD)/halide_dgemm_transA.h: $(BUILD)/blas_l3.generator
	$< -g dgemm -f halide_dgemm_transA -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=true transpose_B=false

$(BUILD)/halide_sgemm_transB.o $(BUILD)/halide_sgemm_transB.h: $(BUILD)/blas_l3.generator
	$< -g sgemm -f halide_sgemm_transB -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=false transpose_B=true

$(BUILD)/halide_dgemm_transB.o $(BUILD)/halide_dgemm_transB.h: $(BUILD)/blas_l3.generator
	$< -g dgemm -f halide_dgemm_transB -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=false transpose_B=true

$(BUILD)/halide_sgemm_transAB.o $(BUILD)/halide_sgemm_transAB.h: $(BUILD)/blas_l3.generator
	$< -g sgemm -f halide_sgemm_transAB -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=true transpose_B=true

$(BUILD)/halide_dgemm_transAB.o $(BUILD)/halide_dgemm_transAB.h: $(BUILD)/blas_l3.generator
	$< -g dgemm -f halide_dgemm_transAB -o $(BUILD) -e $(EMIT_OPTIONS) \
	target=$(HL_TARGET_NR) transpose_A=true transpose_B=true