/**
 * Copyright 2019-2023 by XGBoost Contributors
 */
#include <gtest/gtest.h>
#include <fstream>
#include <memory>
#include <vector>
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
#include "xgboost/data.h"
namespace xgboost {
// Pushing a page into an empty page must reproduce it; pushing the same page a second
// time must keep the number of segments and double the length of each one.
TEST(SparsePage, PushCSC) {
  // Destination page starts out empty: a lone zero offset and no entries.
  std::vector<bst_idx_t> ref_offset{0};
  std::vector<Entry> ref_data;
  SparsePage batch;
  batch.offset.HostVector() = ref_offset;
  batch.data.HostVector() = ref_data;

  // Source page: segment 0 holds one entry and segment 1 holds three.  Entry `k` carries
  // index `k`, so the segments contain the indices {0} and {1, 2, 3} respectively.
  ref_offset = {0, 1, 4};
  auto const n_entries = ref_offset.back();
  for (size_t k = 0; k < n_entries; ++k) {
    ref_data.emplace_back(k, 0.1f);
  }

  SparsePage other;
  other.offset.HostVector() = ref_offset;
  other.data.HostVector() = ref_data;

  // First push: the destination was empty, so it must now mirror `other`.
  batch.PushCSC(other);
  {
    auto const& h_offset = batch.offset.HostVector();
    ASSERT_EQ(h_offset.size(), ref_offset.size());
    auto const& h_data = batch.data.HostVector();
    ASSERT_EQ(h_data.size(), ref_data.size());

    for (size_t k = 0; k < ref_offset.size(); ++k) {
      ASSERT_EQ(h_offset[k], ref_offset[k]);
    }
    // Only the feature index of each entry is compared here.
    for (size_t k = 0; k < ref_data.size(); ++k) {
      ASSERT_EQ(h_data[k].index, ref_data[k].index);
    }
  }

  // Second push: same number of offsets, twice the entries, every offset doubled.
  batch.PushCSC(other);
  {
    auto const& h_offset = batch.offset.HostVector();
    ASSERT_EQ(h_offset.size(), ref_offset.size());
    ASSERT_EQ(batch.data.Size(), ref_data.size() * 2);

    for (size_t k = 0; k < ref_offset.size(); ++k) {
      ASSERT_EQ(h_offset[k], ref_offset[k] * 2);
    }
  }

  auto view = batch.GetView();

  // Segment 0 now holds two entries, both with index 0.
  auto segment = view[0];
  ASSERT_EQ(segment.size(), 2ul);
  for (auto const& entry : segment) {
    ASSERT_EQ(entry.index, 0u);
  }

  // Segment 1 holds the original three indices followed by the same three again.
  segment = view[1];
  ASSERT_EQ(segment.size(), 6ul);
  std::vector<size_t> const pattern{1, 2, 3};
  for (size_t k = 0; k < segment.size(); ++k) {
    ASSERT_EQ(segment[k].index, pattern[k % 3]);
  }
}
// Transposes every batch of a multi-batch DMatrix, merges the transposed batches with
// PushCSC, and checks the entry count and the per-segment value ordering after SortRows.
TEST(SparsePage, PushCSCAfterTranspose) {
  bst_idx_t constexpr kRows = 1024;
  bst_idx_t constexpr kCols = 21;
  auto p_fmat =
      RandomDataGenerator{kRows, kCols, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
  int const n_features = p_fmat->Info().num_col_;

  // Consolidated page that receives the transpose of each batch.
  SparsePage merged;
  for (auto const& batch : p_fmat->GetBatches<xgboost::SparsePage>()) {
    SparsePage transposed = batch.GetTranspose(n_features, AllThreadsForTest());
    merged.PushCSC(transposed);
  }

  // The consolidated page must hold exactly kRows * kCols entries.
  ASSERT_EQ(kRows * kCols, merged.data.Size());

  // After sorting, values within every segment of the view must be non-decreasing.
  merged.SortRows(AllThreadsForTest());
  auto view = merged.GetView();
  for (size_t seg = 0; seg < view.Size(); ++seg) {
    auto column = view[seg];
    // Compare each adjacent pair (k, k + 1).
    for (size_t k = 0; k + 1 < column.size(); ++k) {
      ASSERT_GE(column[k + 1].fvalue, column[k].fvalue);
    }
  }
}
// Checks IsIndicesSorted / SortIndices: freshly generated pages report sorted indices,
// a deliberately perturbed copy does not, and SortIndices restores the sorted state.
TEST(SparsePage, SortIndices) {
  auto p_fmat = RandomDataGenerator{100, 10, 0.6}.GenerateDMatrix();
  auto n_threads = AllThreadsForTest();

  // Gather all batches into a single page, checking each one on the way.
  SparsePage merged;
  for (auto const& page : p_fmat->GetBatches<SparsePage>()) {
    ASSERT_TRUE(page.IsIndicesSorted(n_threads));
    merged.Push(page);
  }
  ASSERT_TRUE(merged.IsIndicesSorted(n_threads));

  // Break the ordering: swap the first and last entry of every row that has at least
  // two entries.
  auto const n_rows = merged.Size();
  auto const& h_offset = merged.offset.HostVector();
  auto& h_data = merged.data.HostVector();
  for (size_t ridx = 0; ridx < n_rows; ++ridx) {
    auto const first = h_offset[ridx];
    auto const past_last = h_offset[ridx + 1];
    if (past_last - first < 2) {
      continue;
    }
    std::swap(h_data[first], h_data[past_last - 1]);
  }
  ASSERT_FALSE(merged.IsIndicesSorted(n_threads));

  // Sorting must bring the page back to the sorted state.
  merged.SortIndices(n_threads);
  ASSERT_TRUE(merged.IsIndicesSorted(n_threads));
}
// Writes a small CSV into a temporary directory and loads it through a URI that names
// the format in its query string, then checks the reported shape.
TEST(DMatrix, Uri) {
  auto constexpr kRows = 16;
  auto constexpr kCols = 8;

  dmlc::TemporaryDirectory tmpdir;
  auto const csv_path = tmpdir.path + "/small.csv";
  CreateTestCSV(csv_path, kRows, kCols);

  std::unique_ptr<DMatrix> dmat;
  // FIXME(trivialfis): Enable the following test by restricting csv parser in dmlc-core.
  // EXPECT_THROW(dmat.reset(DMatrix::Load(csv_path, false, true)), dmlc::Error);

  // Load with the format given explicitly as part of the URI.
  auto const uri = csv_path + "?format=csv";
  dmat.reset(DMatrix::Load(uri, false));

  ASSERT_EQ(dmat->Info().num_col_, kCols);
  ASSERT_EQ(dmat->Info().num_row_, kRows);
}
} // namespace xgboost