/*
 * Copyright (C) 2019-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/memory_manager/gfx_partition.h"

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/heap_assigner.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/utilities/heap_allocator.h"

namespace NEO {

// 32-bit heaps, listed in the order GfxPartition::init() lays them out:
// each entry receives its own consecutive window starting at gfxBase.
const std::array<HeapIndex, 4> GfxPartition::heap32Names{{HeapIndex::heapInternalDeviceMemory,
                                                          HeapIndex::heapInternal,
                                                          HeapIndex::heapExternalDeviceMemory,
                                                          HeapIndex::heapExternal}};

// Heaps that freeGpuAddressRange() walks, in this order, to find the heap owning a GPU address range.
const std::array<HeapIndex, 8> GfxPartition::heapNonSvmNames{{HeapIndex::heapInternalDeviceMemory,
                                                              HeapIndex::heapInternal,
                                                              HeapIndex::heapExternalDeviceMemory,
                                                              HeapIndex::heapExternal,
                                                              HeapIndex::heapStandard,
                                                              HeapIndex::heapStandard64KB,
                                                              HeapIndex::heapStandard2MB,
                                                              HeapIndex::heapExtended}};

// Reserves a CPU address range without a base address hint, starting at 256GB and
// shrinking the request by ~10% (rounded down to 64KB) after every failure.
// Gives up once the request drops below 32GB; in that case reservedCpuAddressRange
// holds the result of the last failed attempt, so callers must check alignedPtr.
static void reserveLow48BitRangeWithRetry(OSMemory *osMemory, OSMemory::ReservedCpuAddressRange &reservedCpuAddressRange) {
    constexpr uint64_t initialRequestSize = 256 * MemoryConstants::gigaByte;
    constexpr uint64_t smallestRequestSize = 32 * MemoryConstants::gigaByte;

    for (uint64_t requestSize = initialRequestSize;
         requestSize >= smallestRequestSize;
         requestSize = alignDown(static_cast<uint64_t>(requestSize * 0.9), MemoryConstants::pageSize64k)) {
        // With no base address being specified OS always reserve memory in [0x000000000000-0x7FFFFFFFFFFF] range
        reservedCpuAddressRange = osMemory->reserveCpuAddressRange(static_cast<size_t>(requestSize), GfxPartition::heapGranularity);

        if (reservedCpuAddressRange.alignedPtr) {
            return;
        }
        // Reservation failed - the loop step retries with a smaller chunk
    }
}

// Tries to reserve `reservationSize` bytes of CPU virtual address space inside the
// area bounded by `areaBase` and `areaTop`.
//
// Step 1: ask the OS for a range at `areaBase`. If the returned range lies inside the
//         area it is kept; otherwise it is released and alignedPtr is reset to nullptr.
// Step 2: read the process memory maps, look for the first gap at or above `areaBase`
//         that can hold the reservation, and ask the OS for a range at that address.
//
// The outcome is reported through `reservedCpuAddressRange`; callers are expected to
// check its alignedPtr. The result of step 2 is returned as-is (it is not re-validated
// against the area bounds).
// NOTE(review): the gap scan assumes getMemoryMaps() returns entries ordered by
// ascending address - confirm against the OSMemory implementation.
static void reserveRangeWithMemoryMapsParse(OSMemory *osMemory, OSMemory::ReservedCpuAddressRange &reservedCpuAddressRange, uint64_t areaBase, uint64_t areaTop, uint64_t reservationSize) {
    uint64_t reservationBase = areaBase;

    // Step 1: optimistic attempt right at the beginning of the area
    reservedCpuAddressRange = osMemory->reserveCpuAddressRange(reinterpret_cast<void *>(reservationBase), static_cast<size_t>(reservationSize), MemoryConstants::pageSize64k);

    if (reservedCpuAddressRange.alignedPtr != nullptr) {
        uint64_t alignedPtrU64 = castToUint64(reservedCpuAddressRange.alignedPtr);
        if (alignedPtrU64 >= areaBase && alignedPtrU64 + reservationSize < areaTop) {
            return;
        } else {
            // The OS placed the range outside of the requested area - give it back
            osMemory->releaseCpuAddressRange(reservedCpuAddressRange);
            reservedCpuAddressRange.alignedPtr = nullptr;
        }
    }

    // Step 2: find a free gap using the current memory mappings of the process
    OSMemory::MemoryMaps memoryMaps;
    osMemory->getMemoryMaps(memoryMaps);

    for (size_t i = 0; reservationBase < areaTop && i < memoryMaps.size(); ++i) {
        const auto &mapping = memoryMaps[i];

        // Mapping ends below the area - not relevant
        if (mapping.end < areaBase) {
            continue;
        }

        // The gap in front of this mapping is only meaningful when the mapping starts at or
        // above the candidate base. For a mapping that starts below it (e.g. one straddling
        // areaBase) the unsigned subtraction would wrap around and report a bogus huge gap.
        if (mapping.start >= reservationBase && mapping.start - reservationBase >= reservationSize) {
            break;
        }

        // Continue searching right after this mapping, never moving the candidate backwards
        if (mapping.end > reservationBase) {
            reservationBase = mapping.end;
        }
    }

    if (reservationBase + reservationSize < areaTop) {
        reservedCpuAddressRange = osMemory->reserveCpuAddressRange(reinterpret_cast<void *>(reservationBase), static_cast<size_t>(reservationSize), MemoryConstants::pageSize64k);
    }
}

// Reserves 1TB of CPU address space in the upper half of the 48-bit range
// (0x800000000000 - 0xFFFFFFFFFFFF) via reserveRangeWithMemoryMapsParse().
static void reserveHigh48BitRangeWithMemoryMapsParse(OSMemory *osMemory, OSMemory::ReservedCpuAddressRange &reservedCpuAddressRange) {
    constexpr uint64_t upperHalfBase = maxNBitValue(47) + 1; // 0x800000000000
    constexpr uint64_t upperHalfTop = maxNBitValue(48);      // 0xFFFFFFFFFFFF
    const uint64_t sizeToReserve = MemoryConstants::teraByte;

    reserveRangeWithMemoryMapsParse(osMemory, reservedCpuAddressRange, upperHalfBase, upperHalfTop, sizeToReserve);
}

// Reserves `reservationSize` bytes of CPU address space above the 48-bit range,
// i.e. between 2^48 and maxNBitValue(56), via reserveRangeWithMemoryMapsParse().
static void reserve57BitRangeWithMemoryMapsParse(OSMemory *osMemory, OSMemory::ReservedCpuAddressRange &reservedCpuAddressRange, uint64_t reservationSize) {
    constexpr uint64_t above48BitBase = maxNBitValue(48) + 1;
    constexpr uint64_t above48BitTop = maxNBitValue(56);

    reserveRangeWithMemoryMapsParse(osMemory, reservedCpuAddressRange, above48BitBase, above48BitTop, reservationSize);
}

// Stores the caller-provided SVM CPU address range descriptor and creates the
// OS memory helper used for reserving and releasing CPU address ranges.
GfxPartition::GfxPartition(OSMemory::ReservedCpuAddressRange &reservedCpuAddressRangeForHeapSvm)
    : reservedCpuAddressRangeForHeapSvm(reservedCpuAddressRangeForHeapSvm),
      osMemory(OSMemory::create()) {
}

// Releases the CPU address ranges reserved for the SVM heap and for the host extended heap.
GfxPartition::~GfxPartition() {
    osMemory->releaseCpuAddressRange(reservedCpuAddressRangeForHeapSvm);
    // Clear the descriptor after releasing it.
    // NOTE(review): the constructor receives this descriptor by reference, so it is presumably
    // shared with the owner / other partitions and must not be released twice - confirm in the header.
    reservedCpuAddressRangeForHeapSvm = {};
    osMemory->releaseCpuAddressRange(reservedCpuAddressRangeForHeapExtended);
}

// Initializes the heap to cover [base, base + size) and creates its allocator.
// One guard granule is kept free at the beginning and at the end of the heap, so the
// allocator manages the range starting at base + granule. The granule is heapGranularity,
// or heapGranularity2MB when allocationAlignment exceeds heapGranularity.
void GfxPartition::Heap::init(uint64_t base, uint64_t size, size_t allocationAlignment) {
    this->base = base;
    this->size = size;

    auto guardSize = GfxPartition::heapGranularity;
    if (allocationAlignment > guardSize) {
        guardSize = GfxPartition::heapGranularity2MB;
    }

    // Heaps that are not bigger than both guards keep their full size
    uint64_t usableSize = size;
    if (usableSize > 2 * guardSize) {
        usableSize -= 2 * guardSize;
    }

    alloc = std::make_unique<HeapAllocator>(base + guardSize, usableSize, allocationAlignment);
}

// Initializes the heap to cover [base, base + size). The allocator starts right at `base`
// (no guard at the front) and leaves the last heapGranularity bytes of the heap unused.
void GfxPartition::Heap::initExternalWithFrontWindow(uint64_t base, uint64_t size) {
    this->base = base;
    this->size = size;

    const uint64_t usableSize = size - GfxPartition::heapGranularity;
    alloc = std::make_unique<HeapAllocator>(base, usableSize, MemoryConstants::pageSize, 0u);
}

// Initializes the heap to cover [base, base + size). The first `frontWindowSize` bytes are
// left out of the allocator (they form the front window) and so is the last heapGranularity
// of the heap; the allocator manages what remains, starting at base + frontWindowSize.
void GfxPartition::Heap::initWithFrontWindow(uint64_t base, uint64_t size, uint64_t frontWindowSize) {
    this->base = base;
    this->size = size;

    const uint64_t allocatorBase = base + frontWindowSize;
    const uint64_t allocatorSize = size - GfxPartition::heapGranularity - frontWindowSize;

    alloc = std::make_unique<HeapAllocator>(allocatorBase, allocatorSize, MemoryConstants::pageSize);
}

// Initializes a front window heap covering [base, base + size).
// Unlike Heap::init(), no guard area is excluded: the allocator manages the whole range.
void GfxPartition::Heap::initFrontWindow(uint64_t base, uint64_t size) {
    this->base = base;
    this->size = size;

    alloc = std::make_unique<HeapAllocator>(base, size, MemoryConstants::pageSize, 0u);
}

// Allocates a GPU address range from this heap by forwarding to the heap allocator.
// `size` is passed by reference to the allocator.
// NOTE(review): presumably the allocator may adjust it to the actually reserved size - confirm in HeapAllocator.
uint64_t GfxPartition::Heap::allocate(size_t &size) {
    return alloc->allocate(size);
}

// Allocates a GPU address range from this heap with a caller-specified alignment by
// forwarding to the heap allocator. `sizeToAllocate` is passed by reference to the allocator.
uint64_t GfxPartition::Heap::allocateWithCustomAlignment(size_t &sizeToAllocate, size_t alignment) {
    return alloc->allocateWithCustomAlignment(sizeToAllocate, alignment);
}

// Returns the GPU address range [ptr, ptr + size) to this heap's allocator.
void GfxPartition::Heap::free(uint64_t ptr, size_t size) {
    alloc->free(ptr, size);
}

// Frees a GPU address range in the first non-SVM heap that strictly contains it,
// i.e. base < ptr and ptr + size < limit. Does nothing when no heap matches.
void GfxPartition::freeGpuAddressRange(uint64_t ptr, size_t size) {
    for (auto heapIndex : GfxPartition::heapNonSvmNames) {
        auto &candidate = getHeap(heapIndex);

        // Skip heaps that do not fully enclose the range
        if ((ptr <= candidate.getBase()) || ((ptr + size) >= candidate.getLimit())) {
            continue;
        }

        candidate.free(ptr, size);
        return;
    }
}

// Returns the lowest GPU address that can be handed out from the given heap:
//  - SVM and front window heaps: the heap base itself,
//  - external heaps whose front window heap has a non-zero limit: base + external front window pool size,
//  - internal heaps: base + internal front window pool size,
//  - 2MB standard heap: base + 2MB granularity,
//  - everything else: base + heap granularity.
uint64_t GfxPartition::getHeapMinimalAddress(HeapIndex heapIndex) {
    switch (heapIndex) {
    case HeapIndex::heapSvm:
    case HeapIndex::heapExternalDeviceFrontWindow:
    case HeapIndex::heapExternalFrontWindow:
    case HeapIndex::heapInternalDeviceFrontWindow:
    case HeapIndex::heapInternalFrontWindow:
        return getHeapBase(heapIndex);

    case HeapIndex::heapExternal:
    case HeapIndex::heapExternalDeviceMemory:
        if (getHeapLimit(HeapAssigner::mapExternalWindowIndex(heapIndex)) != 0) {
            return getHeapBase(heapIndex) + GfxPartition::externalFrontWindowPoolSize;
        }
        // No external front window in use - treated like a regular heap below
        break;

    case HeapIndex::heapInternal:
    case HeapIndex::heapInternalDeviceMemory:
        return getHeapBase(heapIndex) + GfxPartition::internalFrontWindowPoolSize;

    case HeapIndex::heapStandard2MB:
        return getHeapBase(heapIndex) + GfxPartition::heapGranularity2MB;

    default:
        break;
    }

    return getHeapBase(heapIndex) + GfxPartition::heapGranularity;
}

// Partitions the GPU virtual address space into heaps.
//
// Parameters:
//   gpuAddressSpace              - highest GPU virtual address (compared against maxNBitValue(47/48))
//   cpuAddressRangeSizeToReserve - size of the CPU range reserved for the gfx heaps in the 47-bit layout
//   rootDeviceIndex              - index of this root device; selects its slice of the per-device split heaps
//   numRootDevices               - number of root devices sharing the 64KB and 2MB standard heaps
//   useExternalFrontWindowPool   - when true, external 32-bit heaps get a front window carved out
//   systemMemorySize             - forwarded to initAdditionalRange()
//   gfxTop                       - upper bound of the gfx heaps range; overwritten in some layouts
//
// Returns false when a required CPU address range cannot be reserved (or the size to reserve is 0),
// when the reserved range is not aligned to heapGranularity, or when initAdditionalRange() fails;
// true otherwise.
bool GfxPartition::init(uint64_t gpuAddressSpace, size_t cpuAddressRangeSizeToReserve, uint32_t rootDeviceIndex, size_t numRootDevices, bool useExternalFrontWindowPool, uint64_t systemMemorySize, uint64_t gfxTop) {

    /*
     * I. 64-bit builds:
     *
     *   1) 48-bit Full Range SVM gfx layout:
     *
     *                   SVM                  H0   H1   H2   H3      STANDARD      STANDARD64K
     *   |__________________________________|____|____|____|____|________________|______________|
     *   |                                  |    |    |    |    |                |              |
     *   |                                gfxBase                                             gfxTop
     *  0x0                          0x0000800000000000                               0x0000FFFFFFFFFFFF
     *
     *
     *   2) 47-bit Full Range SVM gfx layout:
     *
     *                             gfxSize = 2^47 / 4 = 0x200000000000
     *                      ________________________________________________
     *                     /                                                \
     *           SVM      / H0   H1   H2   H3      STANDARD      STANDARD64K \      SVM
     *   |________________|____|____|____|____|________________|______________|_______________|
     *   |                |    |    |    |    |                |              |               |
     *   |              gfxBase                                             gfxTop            |
     *  0x0    reserveCpuAddressRange(gfxSize)                                      0x00007FFFFFFFFFFF
     *   \_____________________________________ SVM _________________________________________/
     *
     *
     *
     *   3) Limited Range gfx layout (no SVM):
     *
     *     H0   H1   H2   H3        STANDARD          STANDARD64K
     *   |____|____|____|____|____________________|__________________|
     *   |    |    |    |    |                    |                  |
     * gfxBase                                                    gfxTop
     *  0x0                                                    0xFFF...FFF < 47 bit
     *
     *
     * II. 32-bit builds:
     *
     *   1) 32-bit Full Range SVM gfx layout:
     *
     *      SVM    H0   H1   H2   H3      STANDARD      STANDARD64K
     *   |_______|____|____|____|____|________________|______________|
     *   |       |    |    |    |    |                |              |
     *   |     gfxBase                                             gfxTop
     *  0x0  0x100000000                                       gpuAddressSpace
     */

    uint64_t gfxBase = 0x0ull;
    const uint64_t gfxHeap32Size = 4 * MemoryConstants::gigaByte;

    // Step 1: pick the layout, which determines gfxBase (and possibly gfxTop) and the SVM heap
    if (is32bit) {
        // 32-bit build: SVM occupies the low 4GB, gfx heaps start right above it
        gfxBase = maxNBitValue(32) + 1;
        heapInit(HeapIndex::heapSvm, 0ull, gfxBase);
    } else {
        auto cpuVirtualAddressSize = CpuInfo::getInstance().getVirtualAddressSize();
        if (cpuVirtualAddressSize == 48 && gpuAddressSpace == maxNBitValue(48)) {
            // 48-bit CPU and GPU: SVM takes the lower half, gfx heaps the upper half
            gfxBase = maxNBitValue(48 - 1) + 1;
            heapInit(HeapIndex::heapSvm, 0ull, gfxBase);
        } else if (gpuAddressSpace == maxNBitValue(47)) {
            // 47-bit GPU: gfx heaps live inside a CPU address range reserved from the OS,
            // unless a range has already been reserved (alignedPtr is set)
            if (reservedCpuAddressRangeForHeapSvm.alignedPtr == nullptr) {
                if (cpuAddressRangeSizeToReserve == 0) {
                    return false;
                }
                reservedCpuAddressRangeForHeapSvm = osMemory->reserveCpuAddressRange(cpuAddressRangeSizeToReserve, GfxPartition::heapGranularity);
                if (reservedCpuAddressRangeForHeapSvm.originalPtr == nullptr) {
                    return false;
                }
                if (!isAligned<GfxPartition::heapGranularity>(reservedCpuAddressRangeForHeapSvm.alignedPtr)) {
                    return false;
                }
            }
            gfxBase = reinterpret_cast<uint64_t>(reservedCpuAddressRangeForHeapSvm.alignedPtr);
            gfxTop = gfxBase + cpuAddressRangeSizeToReserve;
            // SVM heap spans the whole GPU address space in this layout
            heapInit(HeapIndex::heapSvm, 0ull, gpuAddressSpace + 1);
        } else if (gpuAddressSpace < maxNBitValue(47)) {
            // Limited range: gfx heaps start at 0 and the SVM heap is initialized with zero size
            gfxBase = 0ull;
            heapInit(HeapIndex::heapSvm, 0ull, 0ull);
        } else {
            // Remaining CPU/GPU address size combinations are handled by initAdditionalRange(),
            // which sets gfxBase and gfxTop
            if (!initAdditionalRange(cpuVirtualAddressSize, gpuAddressSpace, gfxBase, gfxTop, rootDeviceIndex, numRootDevices, systemMemorySize)) {
                return false;
            }
        }
    }

    // Step 2: lay out the 32-bit heaps back to back, gfxHeap32Size each, starting at gfxBase
    for (auto heap : GfxPartition::heap32Names) {
        if (useExternalFrontWindowPool && HeapAssigner::heapTypeExternalWithFrontWindowPool(heap)) {
            heapInitExternalWithFrontWindow(heap, gfxBase, gfxHeap32Size);
            // Carve the front window out of the just initialized heap by allocating it from that heap
            size_t externalFrontWindowSize = GfxPartition::externalFrontWindowPoolSize;
            auto allocation = heapAllocate(heap, externalFrontWindowSize);
            heapInitExternalWithFrontWindow(HeapAssigner::mapExternalWindowIndex(heap), allocation,
                                            externalFrontWindowSize);
        } else if (HeapAssigner::isInternalHeap(heap)) {
            // Internal heaps: the front window sits at the heap base, the main heap follows it
            heapInitWithFrontWindow(heap, gfxBase, gfxHeap32Size, GfxPartition::internalFrontWindowPoolSize);
            heapInitFrontWindow(HeapAssigner::mapInternalWindowIndex(heap), gfxBase, GfxPartition::internalFrontWindowPoolSize);
        } else {
            heapInit(heap, gfxBase, gfxHeap32Size);
        }
        gfxBase += gfxHeap32Size;
    }

    // Step 3: split what is left of [gfxBase, gfxTop) into equally sized slots, one per standard heap
    // (heapStandard .. heapStandard2MB), each slot aligned to the largest standard heap granularity
    constexpr uint32_t numStandardHeaps = static_cast<uint32_t>(HeapIndex::heapStandard2MB) - static_cast<uint32_t>(HeapIndex::heapStandard) + 1;
    constexpr uint64_t maxStandardHeapGranularity = std::max(GfxPartition::heapGranularity, GfxPartition::heapGranularity2MB);

    gfxBase = alignUp(gfxBase, maxStandardHeapGranularity);
    uint64_t maxStandardHeapSize = alignDown((gfxTop - gfxBase) / numStandardHeaps, maxStandardHeapGranularity);

    // HEAP_STANDARD uses its whole slot
    auto gfxStandardSize = maxStandardHeapSize;
    heapInit(HeapIndex::heapStandard, gfxBase, gfxStandardSize);
    DEBUG_BREAK_IF(!isAligned<GfxPartition::heapGranularity>(getHeapBase(HeapIndex::heapStandard)));

    gfxBase += maxStandardHeapSize;

    // Split HEAP_STANDARD64K among root devices
    auto gfxStandard64KBSize = alignDown(maxStandardHeapSize / numRootDevices, GfxPartition::heapGranularity);
    heapInitWithAllocationAlignment(HeapIndex::heapStandard64KB, gfxBase + rootDeviceIndex * gfxStandard64KBSize, gfxStandard64KBSize, MemoryConstants::pageSize64k);
    DEBUG_BREAK_IF(!isAligned<GfxPartition::heapGranularity>(getHeapBase(HeapIndex::heapStandard64KB)));

    gfxBase += maxStandardHeapSize;

    // Split HEAP_STANDARD2MB among root devices
    auto gfxStandard2MBSize = alignDown(maxStandardHeapSize / numRootDevices, GfxPartition::heapGranularity2MB);
    heapInitWithAllocationAlignment(HeapIndex::heapStandard2MB, gfxBase + rootDeviceIndex * gfxStandard2MBSize, gfxStandard2MBSize, 2 * MemoryConstants::megaByte);
    DEBUG_BREAK_IF(!isAligned<GfxPartition::heapGranularity2MB>(getHeapBase(HeapIndex::heapStandard2MB)));

    return true;
}

// Sets up the SVM heap and the gfx heaps window (gfxBase/gfxTop) for configurations where
// the CPU virtual address size is 48 or 57 bit and the GPU address space is 48 or 57 bit.
// For a 57-bit GPU it additionally initializes HEAP_EXTENDED (split among root devices) and,
// on an la57 CPU, tries to reserve and initialize the host extended heap.
// Returns false for unsupported address sizes or when no CPU range could be reserved for the gfx heaps.
bool GfxPartition::initAdditionalRange(uint32_t cpuVirtualAddressSize, uint64_t gpuAddressSpace, uint64_t &gfxBase, uint64_t &gfxTop, uint32_t rootDeviceIndex, size_t numRootDevices, uint64_t systemMemorySize) {
    /*
     * 57-bit Full Range SVM gfx layout:
     *
     *                                gfxSize = 256GB(48b)/1TB(57b)                                 2^48 = 0x1_0000_0000_0000          (Not Used Now)
     *                      ________________________________________________                     _______________________________     ___________________
     *                     /                                                \                   /                               \   /                   \
     *           SVM      / H0   H1   H2   H3      STANDARD      STANDARD64K \      SVM        /          HEAP_EXTENDED          \ /                     \
     *   |________________|____|____|____|____|________________|______________|_______________|___________________________________|______________ ..... __|
     *   |                |    |    |    |    |                |              |               |                                   |                       |
     *   |              gfxBase                                     gfxTop < 0xFFFFFFFFFFFF   |                                   |                       |
     *  0x0  reserveCpuAddressRange(gfxSize) < 0xFFFFFFFFFFFF - gfxSize             0x100_0000_0000_0000(57b)           0x100_FFFF_FFFF_FFFF    0x1FF_FFFF_FFFF_FFFF
     *   \_____________________________________ SVM _________________________________________/
     *
     */

    // Only 48/57 bit CPU VA and 48/57 bit GPU VA are handled here
    const bool cpuVaSupported = (cpuVirtualAddressSize == 57) || (cpuVirtualAddressSize == 48);
    const bool gpuVaIs57Bit = (gpuAddressSpace == maxNBitValue(57));
    const bool gpuVaIs48Bit = (gpuAddressSpace == maxNBitValue(48));
    if (!cpuVaSupported || !(gpuVaIs57Bit || gpuVaIs48Bit)) {
        return false;
    }

    const bool cpuUses57BitVa = (cpuVirtualAddressSize == 57) && CpuInfo::getInstance().isCpuFlagPresent("la57");

    if (!cpuUses57BitVa) {
        // On 48 bit CPU this range is reserved for OS usage, do not reserve
        gfxBase = maxNBitValue(48 - 1) + 1; // 0x800000000000
        gfxTop = maxNBitValue(48) + 1;      // 0x1000000000000
        heapInit(HeapIndex::heapSvm, 0ull, gfxBase);
    } else {
        // Always reserve 48 bit window on 57 bit CPU: prefer the high half of the 48-bit range,
        // fall back to an unhinted reservation with shrinking size
        auto &svmRange = reservedCpuAddressRangeForHeapSvm;
        if (svmRange.alignedPtr == nullptr) {
            reserveHigh48BitRangeWithMemoryMapsParse(osMemory.get(), svmRange);

            if (svmRange.alignedPtr == nullptr) {
                reserveLow48BitRangeWithRetry(osMemory.get(), svmRange);
            }

            if (svmRange.alignedPtr == nullptr) {
                return false;
            }
        }

        gfxBase = castToUint64(svmRange.alignedPtr);
        gfxTop = gfxBase + svmRange.sizeToReserve;

        if (gpuVaIs57Bit) {
            heapInit(HeapIndex::heapSvm, 0ull, maxNBitValue(57 - 1) + 1);

            // Host extended heap is optional: it is initialized only when the reservation succeeds
            const uint64_t hostExtendedSize = 4 * systemMemorySize;
            reserve57BitRangeWithMemoryMapsParse(osMemory.get(), reservedCpuAddressRangeForHeapExtended, hostExtendedSize);
            if (reservedCpuAddressRangeForHeapExtended.alignedPtr) {
                heapInit(HeapIndex::heapExtendedHost, castToUint64(reservedCpuAddressRangeForHeapExtended.alignedPtr), hostExtendedSize);
            }
        } else {
            heapInit(HeapIndex::heapSvm, 0ull, maxNBitValue(48) + 1);
        }
    }

    // Init HEAP_EXTENDED only for 57 bit GPU
    if (gpuVaIs57Bit) {
        // Split HEAP_EXTENDED among root devices (like HEAP_STANDARD64K)
        auto perDeviceExtendedSize = alignDown((maxNBitValue(48) + 1) / numRootDevices, GfxPartition::heapGranularity);
        heapInit(HeapIndex::heapExtended, maxNBitValue(57 - 1) + 1 + rootDeviceIndex * perDeviceExtendedSize, perDeviceExtendedSize);
    }

    return true;
}

} // namespace NEO
