File: cm_queue_rt.cpp

package info (click to toggle)
intel-media-driver 18.4.1%2Bdfsg1-1
links: PTS, VCS
area: main
in suites: buster
size: 77,144 kB
sloc: cpp: 784,288; ansic: 95,944; asm: 42,125; python: 353; sh: 156; makefile: 15
file content (3583 lines) | stat: -rw-r--r-- 140,846 bytes
parent folder | download | duplicates (2)
/*
* Copyright (c) 2007-2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file      cm_queue_rt.cpp
//! \brief     Contains CmQueueRT implementations.
//!

#include "cm_queue_rt.h"

#include "cm_mem.h"
#include "cm_device_rt.h"
#include "cm_event_rt.h"
#include "cm_task_rt.h"
#include "cm_task_internal.h"
#include "cm_thread_space_rt.h"
#include "cm_kernel_rt.h"
#include "cm_kernel_data.h"
#include "cm_buffer_rt.h"
#include "cm_group_space.h"
#include "cm_vebox_data.h"
#include "cm_surface_manager.h"
#include "cm_surface_2d_rt.h"
#include "cm_vebox_rt.h"
#include "cm_execution_adv.h"

// Used by GPUCopy
#define BLOCK_PIXEL_WIDTH            (32)
#define BLOCK_HEIGHT                 (8)
#define BLOCK_HEIGHT_NV12            (4)
#define SUB_BLOCK_PIXEL_WIDTH        (8)
#define SUB_BLOCK_HEIGHT             (8)
#define SUB_BLOCK_HEIGHT_NV12        (4)
#define INNER_LOOP                   (4)
#define BYTE_COPY_ONE_THREAD         (1024*INNER_LOOP)  //4K for each thread
#define THREAD_SPACE_WIDTH_INCREMENT (8)
//Used by unaligned copy
#define BLOCK_WIDTH                  (64)
#define PAGE_ALIGNED                 (0x1000)

#define GPUCOPY_KERNEL_LOCK(a) ((a)->locked = true)
#define GPUCOPY_KERNEL_UNLOCK(a) ((a)->locked = false)

namespace CMRT_UMD
{
//*-----------------------------------------------------------------------------
//| Purpose:    Create Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Create(CmDeviceRT *device,
                          CmQueueRT* &queue,
                          CM_QUEUE_CREATE_OPTION queueCreateOption)
{
    int32_t result = CM_SUCCESS;
    queue = new (std::nothrow) CmQueueRT(device, queueCreateOption);
    if( queue )
    {
        result = queue->Initialize( );
        if( result != CM_SUCCESS )
        {
            CmQueueRT::Destroy( queue);
        }
    }
    else
    {
        CM_ASSERTMESSAGE("Error: Failed to create CmQueue due to out of system memory.");
        result = CM_OUT_OF_HOST_MEMORY;
    }
    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Destroy Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Destroy(CmQueueRT* &queue )
{
    if( queue == nullptr )
    {
        return CM_FAILURE;
    }

    uint32_t result = queue->CleanQueue();
    CmSafeDelete( queue );

    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Constructor of Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::CmQueueRT(CmDeviceRT *device,
                     CM_QUEUE_CREATE_OPTION queueCreateOption):
    m_device(device),
    m_eventArray(CM_INIT_EVENT_COUNT),
    m_eventCount(0),
    m_halMaxValues(nullptr),
    m_copyKernelParamArray(CM_INIT_GPUCOPY_KERNL_COUNT),
    m_copyKernelParamArrayCount(0),
    m_queueOption(queueCreateOption)
{

}

//*-----------------------------------------------------------------------------
//| Purpose:    Destructor of Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CmQueueRT::~CmQueueRT()
{
    uint32_t eventArrayUsedSize = m_eventArray.GetMaxSize();
    for( uint32_t i = 0; i < eventArrayUsedSize; i ++ )
    {
        CmEventRT* event = (CmEventRT*)m_eventArray.GetElement( i );
        uint32_t eventReleaseTimes = 0;
        while( event )
        {   // destroy the event no matter if it is released by user
            if(eventReleaseTimes > 2)
            {
                // The max of event's reference cout is 2
                // if the event is not released after 2 times, there is something wrong
                CM_ASSERTMESSAGE("Error: The max of event's reference cout is 2.");
                break;
            }
            CmEventRT::Destroy( event );
            eventReleaseTimes ++;
        }
    }
    m_eventArray.Delete();

    // Do not destroy the kernel in m_copyKernelParamArray.
    // They have been destoyed in ~CmDevice() before destroying Queue
    for( uint32_t i = 0; i < m_copyKernelParamArrayCount; i ++ )
    {
        CM_GPUCOPY_KERNEL *gpuCopyParam = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement( i );
        CmSafeDelete(gpuCopyParam);
    }

    m_copyKernelParamArray.Delete();

}

//*-----------------------------------------------------------------------------
//| Purpose:    Initialize Cm Queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Initialize()
{
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CM_HAL_MAX_VALUES_EX* halMaxValuesEx = nullptr;
    CM_RETURN_CODE hr = CM_SUCCESS;
    m_device->GetHalMaxValues(m_halMaxValues, halMaxValuesEx);

    // Creates or gets GPU Context for the test
    if (m_queueOption.UserGPUContext == true)
    {
        // Checks if it is the user-provided GPU context. If it is valid, we will create the queue with the existing Context
        if (cmHalState->osInterface->pfnIsGpuContextValid(cmHalState->osInterface, (MOS_GPU_CONTEXT)m_queueOption.GPUContext) != MOS_STATUS_SUCCESS)
        {
            // Returns failure
            CM_ASSERTMESSAGE("Error: The user passed in an GPU context which is not valid");
            return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
        }
    }
    else
    {
        MOS_GPUCTX_CREATOPTIONS ctxCreateOption;
        ctxCreateOption.CmdBufferNumScale = cmHalState->cmDeviceParam.maxTasks;

        // Create MDF preset GPU context, update GPUContext in m_queueOption
        if (m_queueOption.QueueType == CM_QUEUE_TYPE_RENDER)
        {
            // command buffer number
            ctxCreateOption.CmdBufferNumScale = HalCm_GetNumCmdBuffers(cmHalState->osInterface, cmHalState->cmDeviceParam.maxTasks);

            MOS_GPU_CONTEXT tmpGpuCtx = cmHalState->requestCustomGpuContext? MOS_GPU_CONTEXT_RENDER4: MOS_GPU_CONTEXT_RENDER3;;

            // check if context handle was specified by user.
            if (m_queueOption.GPUContext != 0)
            {
                tmpGpuCtx = (MOS_GPU_CONTEXT)m_queueOption.GPUContext;
            }

            // sanity check of context handle for CM
            if (HalCm_IsValidGpuContext(tmpGpuCtx) == false)
            {
                return CM_INVALID_USER_GPU_CONTEXT_FOR_QUEUE_EX;
            }

            // SSEU overriding
            if (cmHalState->cmHalInterface->IsOverridePowerOptionPerGpuContext())
            {
                // checking if need shutdown sub-slices for VME usage
                if (m_queueOption.SseuUsageHint == CM_QUEUE_SSEU_USAGE_HINT_VME
                 && cmHalState->cmHalInterface->IsRequestShutdownSubslicesForVmeUsage())
                {
                    MEDIA_SYSTEM_INFO *gtSystemInfo = cmHalState->osInterface->pfnGetGtSystemInfo(cmHalState->osInterface);
                    ctxCreateOption.packed.SliceCount    = (uint8_t)gtSystemInfo->SliceCount;
                    ctxCreateOption.packed.SubSliceCount = (gtSystemInfo->SubSliceCount / gtSystemInfo->SliceCount) >> 1; // set to half
                    ctxCreateOption.packed.MaxEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                    ctxCreateOption.packed.MinEUcountPerSubSlice = gtSystemInfo->EUCount/gtSystemInfo->SubSliceCount;
                }

#if (_DEBUG || _RELEASE_INTERNAL)
                MOS_USER_FEATURE_VALUE_DATA UserFeatureData = {0};
                MOS_UserFeature_ReadValue_ID(
                    nullptr,
                    __MEDIA_USER_FEATURE_VALUE_SSEU_SETTING_OVERRIDE_ID,
                    &UserFeatureData);

                // +---------------+----------------+----------------+----------------+
                // |   EUCountMax  |   EUCountMin   |     SSCount    |   SliceCount   |
                // +-------------24+--------------16+---------------8+---------------0+
                if (UserFeatureData.u32Data != 0xDEADC0DE)
                {
                    ctxCreateOption.packed.SliceCount            = UserFeatureData.u32Data         & 0xFF;       // Bits 0-7
                    ctxCreateOption.packed.SubSliceCount         = (UserFeatureData.u32Data >>  8) & 0xFF;       // Bits 8-15
                    ctxCreateOption.packed.MaxEUcountPerSubSlice = (UserFeatureData.u32Data >> 16) & 0xFF;       // Bits 16-23
                    ctxCreateOption.packed.MinEUcountPerSubSlice = (UserFeatureData.u32Data >> 24) & 0xFF;       // Bits 24-31
                }
#endif
            }

            ctxCreateOption.runAloneMode = m_queueOption.RunAloneMode;

            // Create Render GPU Context
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->pfnCreateGPUContext(cmHalState, tmpGpuCtx, MOS_GPU_NODE_3D, &ctxCreateOption));

            // Set current GPU context
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->osInterface->pfnSetGpuContext(cmHalState->osInterface, tmpGpuCtx));

#if (_RELEASE_INTERNAL || _DEBUG)
#if defined(CM_DIRECT_GUC_SUPPORT)
            //init GuC
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmHalState->osInterface->pfnInitGuC(cmHalState->osInterface, MOS_GPU_NODE_3D));
#endif
#endif
            m_queueOption.GPUContext = tmpGpuCtx;
        }
        else if (m_queueOption.QueueType == CM_QUEUE_TYPE_COMPUTE)
        {
            ctxCreateOption.runAloneMode = m_queueOption.RunAloneMode;
            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                cmHalState->pfnCreateGPUContext(cmHalState, MOS_GPU_CONTEXT_CM_COMPUTE,
                                                MOS_GPU_NODE_COMPUTE, &ctxCreateOption));

            CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(
                cmHalState->osInterface->pfnSetGpuContext(cmHalState->osInterface,
                                                          MOS_GPU_CONTEXT_CM_COMPUTE));

            m_queueOption.GPUContext = MOS_GPU_CONTEXT_CM_COMPUTE;
        }
        else
        {
            // Returns failure
            CM_ASSERTMESSAGE("Error: The QueueType is not supported by MDF.");
            return CM_NOT_IMPLEMENTED;
        }
    }

finish:
    return hr;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Checks whether any kernels in the task have a thread argument
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::GetTaskHasThreadArg(CmKernelRT* kernelArray[], uint32_t numKernels, bool& threadArgExists)
{
    threadArgExists = false;

    for(uint32_t krn = 0; krn < numKernels; krn++)
    {
        if( !kernelArray[krn] )
        {
            CM_ASSERTMESSAGE("Error: The kernel in the task have no thread argument.");
            return CM_FAILURE;
        }

        if( kernelArray[krn]->IsThreadArgExisted( ) )
        {
            threadArgExists = true;
            break;
        }
    }

    return CM_SUCCESS;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Enqueue Task
//| Arguments :
//|               kernelArray      [in]       Pointer to kernel array
//|               event            [in]       Reference to the pointer to Event
//|               threadSpace               [out]      Pointer to thread space
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::Enqueue(
           CmTask* kernelArray,
           CmEvent* & event,
           const CmThreadSpace* threadSpace)
{
    INSERT_API_CALL_LOG();

    if (kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is null.");
        return CM_INVALID_ARG_VALUE;
    }

    CmTaskRT *kernelArrayRT = static_cast<CmTaskRT *>(kernelArray);
    uint32_t kernelCount = 0;
    kernelCount = kernelArrayRT->GetKernelCount();
    if (kernelCount == 0)
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        return CM_FAILURE;
    }

    if (kernelCount > m_halMaxValues->maxKernelsPerTask)
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds max kernel per enqueue.");
        return CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
    }

    int32_t result;
    const CmThreadSpaceRT *threadSpaceRTConst = static_cast<const CmThreadSpaceRT *>(threadSpace);
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
    {
        if (threadSpaceRTConst != nullptr)
        {
            result = EnqueueWithGroup(kernelArray, event, threadSpaceRTConst->GetThreadGroupSpace());
        }
        else
        {
            // If there isn't any shared thread space or associated thread space,
            // create a temporary (maxThreadCount x 1) thread group space whose
            // size equal to the max thread count of kernel who doesn't have a
            // thread space associated.
            uint32_t maxThreadCount = 1;
            bool usedCommonTGS = false;
            for (uint32_t i = 0; i < kernelCount; i++)
            {
                CmKernelRT *tmpKernel = kernelArrayRT->GetKernelPointer(i);
                CmThreadGroupSpace *tmpTGS = nullptr;
                tmpKernel->GetThreadGroupSpace(tmpTGS);

                if (tmpTGS == nullptr)
                {
                    usedCommonTGS = true;
                    uint32_t singleThreadCount = 0;
                    tmpKernel->GetThreadCount(singleThreadCount);
                    if (maxThreadCount < singleThreadCount)
                    {
                        maxThreadCount = singleThreadCount;
                    }
                }
            }

            CmThreadGroupSpace *threadGroupSpaceTemp = nullptr;
            if (usedCommonTGS == true)
            {
                result = m_device->CreateThreadGroupSpace(1, 1, maxThreadCount, 1, threadGroupSpaceTemp);
                if (result != CM_SUCCESS)
                {
                    CM_ASSERTMESSAGE("Error: Creating temporary thread group space failure.");
                    return result;
                }
            }

            result = EnqueueWithGroup(kernelArray, event, threadGroupSpaceTemp);

            if (threadGroupSpaceTemp != nullptr)
            {
                m_device->DestroyThreadGroupSpace(threadGroupSpaceTemp);
            }
        }
        return result;
    }

    if (threadSpaceRTConst && threadSpaceRTConst->IsThreadAssociated())
    {
        if (threadSpaceRTConst->GetNeedSetKernelPointer() && threadSpaceRTConst->KernelPointerIsNULL())
        {
            CmKernelRT* tmp = nullptr;
            tmp = kernelArrayRT->GetKernelPointer(0);
            threadSpaceRTConst->SetKernelPointer(tmp);
        }
    }

#if _DEBUG
    if (threadSpaceRTConst)
    {
        CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
        if (!threadSpaceRT->IntegrityCheck(kernelArrayRT))
        {
            CM_ASSERTMESSAGE("Error: Invalid thread space.");
            return CM_INVALID_THREAD_SPACE;
        }
    }
#endif

    if(m_device->IsPrintEnable())
    {
        m_device->ClearPrintBuffer();
    }

    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (kernelCount + 1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < kernelCount; i++)
    {
        tmp[ i ] = kernelArrayRT->GetKernelPointer(i);

        uint32_t singleThreadNumber = 0;
        tmp[i]->GetThreadCount(singleThreadNumber);
        if (singleThreadNumber == 0)
        {
            CmThreadSpaceRT *threadSpaceRT = const_cast<CmThreadSpaceRT*>(threadSpaceRTConst);
            if (threadSpaceRT)
            {
                uint32_t width, height;
                threadSpaceRT->GetThreadSpaceSize(width, height);
                singleThreadNumber = width*height;
            }
        }
        totalThreadNumber += singleThreadNumber;
    }
    tmp[kernelCount ] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    result = Enqueue_RT(tmp, kernelCount, totalThreadNumber, eventRT, threadSpaceRTConst, kernelArrayRT->GetSyncBitmap(), kernelArrayRT->GetPowerOption(),
                        kernelArrayRT->GetConditionalEndBitmap(), kernelArrayRT->GetConditionalEndInfo(), kernelArrayRT->GetTaskConfig());

    if (eventRT)
    {
        eventRT->SetKernelNames(kernelArrayRT, const_cast<CmThreadSpaceRT*>(threadSpaceRTConst), nullptr);
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}

//*-----------------------------------------------------------------------------
//| Purpose:      Enqueue Task
//| Arguments :
//|               kernelArray      [in]       Pointer to kernel array
//|               event            [in]       Reference to the pointer to Event
//|               threadSpace               [out]      Pointer to thread space
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::Enqueue_RT(
                        CmKernelRT* kernelArray[],
                        const uint32_t kernelCount,
                        const uint32_t totalThreadCount,
                        CmEventRT* & event,
                        const CmThreadSpaceRT* threadSpace,
                        uint64_t    syncBitmap,
                        PCM_POWER_OPTION powerOption,
                        uint64_t    conditionalEndBitmap,
                        CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                        PCM_TASK_CONFIG  taskConfig)
{
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    bool isEventVisible = (event == CM_NO_EVENT)? false:true;

    CLock Locker(m_criticalSectionTaskInternal);
    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create(kernelCount, totalThreadCount, kernelArray, threadSpace, m_device, syncBitmap, task, conditionalEndBitmap, conditionalEndInfo);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;

    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.");
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

int32_t CmQueueRT::Enqueue_RT(CmKernelRT* kernelArray[],
                        const uint32_t kernelCount,
                        const uint32_t totalThreadCount,
                        CmEventRT* & event,
                        const CmThreadGroupSpace* threadGroupSpace,
                        uint64_t    syncBitmap,
                        PCM_POWER_OPTION powerOption,
                        uint64_t    conditionalEndBitmap,
                        CM_HAL_CONDITIONAL_BB_END_INFO* conditionalEndInfo,
                        PCM_TASK_CONFIG  taskConfig,
                        const CM_EXECUTION_CONFIG* krnExecCfg)
{
    if(kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    if( kernelCount == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_INVALID_ARG_VALUE;
    }

    CLock Locker(m_criticalSectionTaskInternal);

    CmTaskInternal* task = nullptr;
    int32_t result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray,
                                            threadGroupSpace, m_device, syncBitmap, task,
                                            conditionalEndBitmap, conditionalEndInfo, krnExecCfg);
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CmTaskInternal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )))
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        return CM_FAILURE;
    }

    int32_t taskDriverId = -1;
    result = CreateEvent(task, !(event == CM_NO_EVENT) , taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    task->SetPowerOption( powerOption );

    task->SetProperty(taskConfig);

    if( !m_enqueuedTasks.Push( task ) )
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

int32_t CmQueueRT::Enqueue_RT( CmKernelRT* kernelArray[],
                        CmEventRT* & event,
                        uint32_t numTasksGenerated,
                        bool isLastTask,
                        uint32_t hints,
                        PCM_POWER_OPTION powerOption)
{
    int32_t result = CM_FAILURE;
    uint32_t kernelCount = 0;
    CmTaskInternal* task = nullptr;
    int32_t taskDriverId = -1;
    bool isEventVisible = (event == CM_NO_EVENT) ? false:true;
    bool threadArgExists = false;

    if( kernelArray == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }
    while( kernelArray[ kernelCount ] )
    {
        kernelCount++;
    }

    if( kernelCount < CM_MINIMUM_NUM_KERNELS_ENQWHINTS )
    {
        CM_ASSERTMESSAGE("Error: EnqueueWithHints requires at least 2 kernels.");
        return CM_FAILURE;
    }

    uint32_t totalThreadCount = 0;
    for( uint32_t i = 0; i < kernelCount; i ++ )
    {
        uint32_t threadCount = 0;
        kernelArray[i]->GetThreadCount( threadCount );
        totalThreadCount += threadCount;
    }

    if( GetTaskHasThreadArg(kernelArray, kernelCount, threadArgExists) != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Thread argument checking fails.");
        return CM_FAILURE;
    }

    if( !threadArgExists )
    {
        if (totalThreadCount > m_halMaxValues->maxUserThreadsPerTaskNoThreadArg )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }
    else
    {
        if( totalThreadCount > m_halMaxValues->maxUserThreadsPerTask )
        {
            CM_ASSERTMESSAGE("Error: Maximum number of threads per task exceeded.");
            return CM_EXCEED_MAX_THREAD_AMOUNT_PER_ENQUEUE;
        }
    }

    CLock Locker(m_criticalSectionTaskInternal);

    result = CmTaskInternal::Create( kernelCount, totalThreadCount, kernelArray, task, numTasksGenerated, isLastTask, hints, m_device );
    if( result != CM_SUCCESS )
    {
        CM_ASSERTMESSAGE("Error: Create CM task internal failure.");
        return result;
    }

    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
    {
        CM_ASSERTMESSAGE("Error: Query performance counter failure.");
        return CM_FAILURE;
    }

    result = CreateEvent(task, isEventVisible, taskDriverId, event);
    if (result != CM_SUCCESS)
    {
        CM_ASSERTMESSAGE("Error: Create event failure.");
        return result;
    }
    if ( event != nullptr )
    {
        event->SetEnqueueTime( nEnqueueTime );
    }

    for( uint32_t i = 0; i < kernelCount; ++i )
    {
        CmKernelRT* kernel = nullptr;
        task->GetKernel(i, kernel);
        if( kernel != nullptr )
        {
            kernel->SetAdjustedYCoord(0);
        }
    }

    task->SetPowerOption( powerOption );

    if (!m_enqueuedTasks.Push(task))
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
        return CM_FAILURE;
    }

    result = FlushTaskWithoutSync();

    return result;
}

//*-----------------------------------------------------------------------------
//! Function to enqueue task with thread group space pointer
//! Arguments:
//!     1. Pointer to CmTask, which can only contain one kernel.
//!     2. Reference to the pointer to CmEvent that is to be returned
//!     3. Pointer to a CmThreadGroupSpace.
//! Return Value:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated
//!     CM_OUT_OF_HOST_MEMORY if out of host memory
//!     CM_FAILURE otherwise
//! Notes:
//!     If the kernel has per thread arg, GPGPU object is to be used.
//!     If the kernel has no per thread  arg. GPGPU walker is used.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueWithGroup( CmTask* task, CmEvent* & event, const CmThreadGroupSpace* threadGroupSpace)
{
    INSERT_API_CALL_LOG();

    int32_t result;

    if(task == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Kernel array is NULL.");
        return CM_INVALID_ARG_VALUE;
    }

    CmTaskRT *taskRT = static_cast<CmTaskRT *>(task);
    uint32_t count = 0;
    count = taskRT->GetKernelCount();

    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: There are no valid kernels.");
        return CM_FAILURE;
    }

    if(m_device->IsPrintEnable())
    {
        m_device->ClearPrintBuffer();
    }

    typedef CmKernelRT* pCmKernel;
    CmKernelRT** tmp = MOS_NewArray(pCmKernel, (count+1));
    if(tmp == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }

    uint32_t totalThreadNumber = 0;
    for(uint32_t i = 0; i < count; i++)
    {
        uint32_t singleThreadNumber = 0;
        tmp[ i ] = taskRT->GetKernelPointer(i);

        //Thread arguments is not allowed in GPGPU_WALKER path
        if(tmp[i]->IsThreadArgExisted())
        {
            CM_ASSERTMESSAGE("Error: No thread Args allowed when using group space");
            MosSafeDeleteArray(tmp);
            return CM_THREAD_ARG_NOT_ALLOWED;
        }

        tmp[i]->GetThreadCount(singleThreadNumber);
        totalThreadNumber += singleThreadNumber;
    }
    tmp[count ] = nullptr;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    result = Enqueue_RT( tmp, count, totalThreadNumber, eventRT,
                         threadGroupSpace, taskRT->GetSyncBitmap(),
                         taskRT->GetPowerOption(),
                         taskRT->GetConditionalEndBitmap(), taskRT->GetConditionalEndInfo(),
                         taskRT->GetTaskConfig(), taskRT->GetKernelExecuteConfig());

    if (eventRT)
    {
        eventRT->SetKernelNames(taskRT, nullptr, const_cast<CmThreadGroupSpace*>(threadGroupSpace));
    }

    event = eventRT;
    MosSafeDeleteArray( tmp );

    return result;
}

CM_RT_API int32_t CmQueueRT::EnqueueWithHints(
                                        CmTask* kernelArray,
                                        CmEvent* & event,
                                        uint32_t hints)
{
    INSERT_API_CALL_LOG();

    int32_t            hr                = CM_FAILURE;
    uint32_t           count             = 0;
    uint32_t           index             = 0;
    CmKernelRT**         kernels          = nullptr;
    uint32_t           numTasks          = 0;
    bool               splitTask         = false;
    bool               lastTask          = false;
    uint32_t           numTasksGenerated = 0;
    CmEventRT          *eventRT = static_cast<CmEventRT *>(event);

    if (kernelArray == nullptr)
    {
        return CM_INVALID_ARG_VALUE;
    }
    CmTaskRT         *kernelArrayRT   = static_cast<CmTaskRT *>(kernelArray);
    count = kernelArrayRT->GetKernelCount();
    if( count == 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid kernel count.");
        hr = CM_FAILURE;
        goto finish;
    }

    if( count > m_halMaxValues->maxKernelsPerTask )
    {
        CM_ASSERTMESSAGE("Error: Kernel count exceeds maximum kernel per enqueue.");
        hr = CM_EXCEED_MAX_KERNEL_PER_ENQUEUE;
        goto finish;
    }

    for (uint32_t i = 0; i < count; ++i)
    {
        CmKernelRT* kernelTmp = nullptr;
        CmThreadSpaceRT* threadSpaceTmp = nullptr;
        kernelTmp = kernelArrayRT->GetKernelPointer(i);
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelTmp);
        kernelTmp->GetThreadSpace(threadSpaceTmp);
        CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceTmp);
        if (threadSpaceTmp->GetNeedSetKernelPointer() && threadSpaceTmp->KernelPointerIsNULL())
        {
            threadSpaceTmp->SetKernelPointer(kernelTmp);
        }
    }

#if _DEBUG
    if( !kernelArrayRT->IntegrityCheckKernelThreadspace() )
    {
        CM_ASSERTMESSAGE("Error: Integrity check for kernel thread space failed.");
        hr = CM_KERNEL_THREADSPACE_INTEGRITY_FAILED;
        goto finish;
    }
#endif

    numTasks = ( hints & CM_HINTS_MASK_NUM_TASKS ) >> CM_HINTS_NUM_BITS_TASK_POS;
    if( numTasks > 1 )
    {
        splitTask = true;
    }

    if( m_device->IsPrintEnable() )
    {
        m_device->ClearPrintBuffer();
    }

    kernels = MOS_NewArray(CmKernelRT*, (count + 1));
    CM_CHK_NULL_GOTOFINISH_CMERROR(kernels);

    do
    {
        for (index = 0; index < count; ++index)
        {
            kernels[ index ] = kernelArrayRT->GetKernelPointer( index );
        }

        kernels[ count ] = nullptr;

        if(splitTask)
        {
            if( numTasksGenerated == (numTasks - 1 ) )
            {
                lastTask = true;
            }
        }
        else
        {
            lastTask = true;
        }

        CM_CHK_CMSTATUS_GOTOFINISH(Enqueue_RT( kernels, eventRT, numTasksGenerated, lastTask, hints, kernelArrayRT->GetPowerOption() ));
        event = eventRT;
        numTasksGenerated++;

    }while(numTasksGenerated < numTasks);

finish:
    MosSafeDeleteArray( kernels );

    return hr;
}

//*-----------------------------------------------------------------------------
//! Enqueue an task, which contains one pre-defined kernel to
//! copy from host memory to surface
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishs.
//! INPUT:
//!     1) Pointer to the CmSurface2D_RT as copy destination
//!     2) Pointer to the host memory as copy source
//!     3) Reference to the pointer to CMEvent
//!     4) A boolean value to indicate if or not to flush the queue after enqueue the task
//!        by default the boolean value is TRUE.
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memery;
//!     CM_FAILURE otherwise.
//!     More error code is coming.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPU( CmSurface2D* surface, const unsigned char* sysMem, CmEvent* & event )
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, 0, 0, CM_FASTCOPY_CPU2GPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}

//*-----------------------------------------------------------------------------
//! Enqueue an task, which contains one pre-defined kernel to
//! copy from surface to host memory
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishs.
//! INPUT:
//!     1) Pointer to the CmSurface2D_RT as copy source
//!     2) Pointer to the host memory as copy destination
//!     3) Reference to the pointer to CMEvent
//!     4) A boolean value to indicate if or not to flush the queue after enqueue the task
//!        by default the boolean value is TRUE.
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memery;
//!     CM_FAILURE otherwise.
//!     More error code is coming.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPU( CmSurface2D* surface, unsigned char* sysMem, CmEvent* & event )
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, sysMem, 0, 0, CM_FASTCOPY_GPU2CPU, CM_FASTCOPY_OPTION_NONBLOCKING, event);
}

int32_t CmQueueRT::EnqueueUnalignedCopyInternal( CmSurface2DRT* surface, unsigned char* sysMem, const uint32_t widthStride, const uint32_t heightStride, CM_GPUCOPY_DIRECTION direction)
{
    int32_t         hr                          = CM_SUCCESS;
    uint32_t        bufferupSize               = 0;
    uint32_t        dstAddShiftOffset           = 0;
    uint32_t        threadWidth                 = 0;
    uint32_t        threadHeight                = 0;
    uint32_t        threadNum                   = 0;
    uint32_t        auxiliaryBufferupSize     = 0;
    uint32_t        width                       = 0;
    uint32_t        height                      = 0;
    uint32_t        sizePerPixel                = 0;
    uint32_t        widthByte                  = 0;
    uint32_t        copyWidthByte             = 0;
    uint32_t        copyHeightRow             = 0;
    uint32_t        strideInBytes             = widthStride;
    uint32_t        heightStrideInRows       = heightStride;
    size_t          linearAddress              = (size_t)sysMem;
    size_t          linearAddressAligned       = 0;
    unsigned char*  hybridCopyAuxSysMem        = nullptr;

    CmBufferUP             *bufferUP                  = nullptr;
    CmKernel               *kernel                    = nullptr;
    CmBufferUP             *hybridCopyAuxBufferUP     = nullptr;
    SurfaceIndex           *bufferIndexCM             = nullptr;
    SurfaceIndex           *hybridCopyAuxIndexCM      = nullptr;
    SurfaceIndex           *surf2DIndexCM             = nullptr;
    CmThreadSpace          *threadSpace               = nullptr;
    CmTask                 *gpuCopyTask               = nullptr;
    CmProgram              *gpuCopyProgram            = nullptr;
    CmEvent                *event                     = nullptr;
    CM_STATUS              status;
    CM_SURFACE_FORMAT      format;

    if ( surface )
    {
        CM_CHK_CMSTATUS_GOTOFINISH( surface->GetSurfaceDesc(width, height, format, sizePerPixel));
    }
    else
    {
        return CM_FAILURE;
    }

    widthByte                  = width * sizePerPixel;
    // the actual copy region
    copyWidthByte             = MOS_MIN(strideInBytes, widthByte);
    copyHeightRow             = MOS_MIN(heightStrideInRows, height);

    if(linearAddress == 0)
    {
        CM_ASSERTMESSAGE("Error: Pointer to system memory is null.");
        return CM_INVALID_ARG_VALUE;
    }
    if( (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_WIDTH ) || ( copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT) )
    {  // each thread handles 64x8 block data. This API will fail if it exceeds the max thread space's size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_INVALID_ARG_SIZE;
    }

    if (sizeof (void *) == 8 ) //64-bit
    {
        linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
    }
    else  //32-bit
    {
        linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
    }
    //Calculate  Left Shift offset
    dstAddShiftOffset               = (uint32_t)(linearAddress - linearAddressAligned);

    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        bufferupSize = MOS_ALIGN_CEIL(strideInBytes * (heightStrideInRows + copyHeightRow * 1/2) + (uint32_t)dstAddShiftOffset , 64);
    }
    else
    {
        bufferupSize = MOS_ALIGN_CEIL(strideInBytes * heightStrideInRows  + (uint32_t)dstAddShiftOffset, 64);
    }

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferupSize, ( void * )linearAddressAligned, bufferUP));
    CM_CHK_CMSTATUS_GOTOFINISH(bufferUP->GetIndex(bufferIndexCM));
    CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));

    CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);

    if (direction == CM_FASTCOPY_CPU2GPU)
    {
        if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
        }
        else
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_write_unaligned), kernel, "PredefinedGPUCopyKernel"));

        }
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
    }
    else
    {
        if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned_NV12), kernel, "PredefinedGPUCopyKernel"));
            auxiliaryBufferupSize = BLOCK_WIDTH * 2 * (heightStrideInRows + copyHeightRow * 1/2);
        }
        else
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(surfaceCopy_read_unaligned), kernel, "PredefinedGPUCopyKernel"));
            auxiliaryBufferupSize = BLOCK_WIDTH * 2 * heightStrideInRows;
        }
        hybridCopyAuxSysMem = (unsigned char*)MOS_AlignedAllocMemory(auxiliaryBufferupSize, PAGE_ALIGNED);
        if(!hybridCopyAuxSysMem)
        {
            CM_ASSERTMESSAGE("Error: Out of system memory.");
            return CM_OUT_OF_HOST_MEMORY;
        }
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(auxiliaryBufferupSize, (void*)hybridCopyAuxSysMem, hybridCopyAuxBufferUP));
        CM_CHK_CMSTATUS_GOTOFINISH(hybridCopyAuxBufferUP->GetIndex(hybridCopyAuxIndexCM));

        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &copyWidthByte ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( SurfaceIndex ), hybridCopyAuxIndexCM ));
    }

    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInBytes ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &dstAddShiftOffset ));

    threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_WIDTH );
    threadHeight = ( uint32_t )ceil( ( double )copyHeightRow/BLOCK_HEIGHT );

    threadNum = threadWidth * threadHeight;
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
    CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
    CM_CHK_CMSTATUS_GOTOFINISH(Enqueue(gpuCopyTask, event, threadSpace));

    if(event)
    {
        CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
        while(status != CM_STATUS_FINISHED)
        {
            if (status == CM_STATUS_RESET)
            {
                hr = CM_TASK_MEDIA_RESET;
                goto finish;
            }
            CM_CHK_CMSTATUS_GOTOFINISH(event->GetStatus(status));
        }
    }
    // CPU copy unaligned data
    if( direction == CM_FASTCOPY_GPU2CPU)
    {
        uint32_t readOffset = 0;
        uint32_t copyLines = 0;
        unsigned char* startBuffer = (unsigned char*)linearAddressAligned;

        copyLines = (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016) ? heightStrideInRows + MOS_MIN(heightStrideInRows, height) * 1 / 2 : heightStrideInRows;

        for(uint32_t i = 0; i < copyLines; ++i)
        {
            //copy begining of line
            size_t beginLineWriteOffset = strideInBytes * i + dstAddShiftOffset;
            uint32_t mod = ((uintptr_t)startBuffer + beginLineWriteOffset) < BLOCK_WIDTH ? ((uintptr_t)startBuffer + beginLineWriteOffset) : ((uintptr_t)startBuffer + beginLineWriteOffset) & (BLOCK_WIDTH - 1);
            uint32_t beginLineCopySize = (mod == 0) ? 0:(BLOCK_WIDTH - mod);
            //fix copy size for cases where the surface width is small
            if((beginLineCopySize > widthByte) || ( beginLineCopySize == 0 && widthByte < BLOCK_WIDTH ) )
            {
                beginLineCopySize = widthByte;
            }
            if(beginLineCopySize > 0)
            {
                CmSafeMemCopy((void *)( (unsigned char *)startBuffer + beginLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset), beginLineCopySize);
            }

            //copy end of line
            uint32_t alignedWrites = (copyWidthByte - beginLineCopySize) &~ (BLOCK_WIDTH - 1);
            uint32_t endLineWriteOffset = beginLineWriteOffset + alignedWrites + beginLineCopySize;
            uint32_t endLineCopySize = dstAddShiftOffset+ i * strideInBytes + copyWidthByte - endLineWriteOffset;
            if(endLineCopySize > 0 && endLineWriteOffset > beginLineWriteOffset)
            {
                CmSafeMemCopy((void *)((unsigned char *)startBuffer + endLineWriteOffset), (void *)(hybridCopyAuxSysMem + readOffset + BLOCK_WIDTH), endLineCopySize);
            }
            readOffset += (BLOCK_WIDTH * 2);
        }
    }

    CM_CHK_CMSTATUS_GOTOFINISH(DestroyEvent(event));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(bufferUP));
    if (direction == CM_FASTCOPY_GPU2CPU)
    {
        if(hybridCopyAuxBufferUP)
        {
            CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(hybridCopyAuxBufferUP));
        }
        if(hybridCopyAuxSysMem)
        {
            MOS_AlignedFreeMemory(hybridCopyAuxSysMem);
            hybridCopyAuxSysMem = nullptr;
        }
    }
finish:
    if(hr != CM_SUCCESS)
    {
        if(bufferUP == nullptr)
        {
            // user need to know whether the failure is caused by out of BufferUP.
            hr = CM_GPUCOPY_OUT_OF_RESOURCE;
        }

        if(event)                          DestroyEvent(event);
        if(kernel)                         m_device->DestroyKernel(kernel);
        if(threadSpace)                    m_device->DestroyThreadSpace(threadSpace);
        if(gpuCopyTask)                    m_device->DestroyTask(gpuCopyTask);
        if(bufferUP)                       m_device->DestroyBufferUP(bufferUP);
        if(hybridCopyAuxBufferUP)          m_device->DestroyBufferUP(hybridCopyAuxBufferUP);
        if(hybridCopyAuxSysMem)            {MOS_AlignedFreeMemory(hybridCopyAuxSysMem); hybridCopyAuxSysMem = nullptr;}
    }

    return hr;
}
//*-----------------------------------------------------------------------------
//! Enqueue an task, which contains one pre-defined kernel to
//! copy from surface to host memory or from host memory to surface
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishes.
//! INPUT:
//!     1) Pointer to the CmSurface2D
//!     2) Pointer to the host memory
//!     3) Width stride in bytes, if there is no padding in system memroy, it is set to zero.
//!     4) Height stride in row, if there is no padding in system memroy, it is set to zero.
//!     4) Copy direction, cpu->gpu (linear->tiled) or gpu->cpu(tiled->linear)
//!     5) Reference to the pointer to CMEvent
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memery;
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::EnqueueCopyInternal(CmSurface2DRT* surface,
                                unsigned char* sysMem,
                                const uint32_t widthStride,
                                const uint32_t heightStride,
                                CM_GPUCOPY_DIRECTION direction,
                                const uint32_t option,
                                CmEvent* & event)
{
    int32_t hr                  = CM_FAILURE;
    uint32_t width               = 0;
    uint32_t height              = 0;
    uint32_t sizePerPixel        = 0;
    CM_SURFACE_FORMAT format    = CM_SURFACE_FORMAT_INVALID;

    if (surface)
    {
        CM_CHK_CMSTATUS_GOTOFINISH(surface->GetSurfaceDesc(width, height, format, sizePerPixel));
    }
    else
    {
        return CM_GPUCOPY_INVALID_SURFACES;
    }

    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        hr = EnqueueCopyInternal_2Planes(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
    }
    else
    {
        hr = EnqueueCopyInternal_1Plane(surface, (unsigned char*)sysMem, format, width, widthStride, height, heightStride, sizePerPixel, direction, option, event);
    }

finish:
    return hr;
}

int32_t CmQueueRT::EnqueueCopyInternal_1Plane(CmSurface2DRT* surface,
                                    unsigned char* sysMem,
                                    CM_SURFACE_FORMAT format,
                                    const uint32_t widthInPixel,
                                    const uint32_t widthStride,
                                    const uint32_t heightInRow,
                                    const uint32_t heightStride,
                                    const uint32_t sizePerPixel,
                                    CM_GPUCOPY_DIRECTION direction,
                                    const uint32_t option,
                                    CmEvent* & event )
{
    int32_t         hr                      = CM_SUCCESS;
    uint32_t        tempHeight              = heightInRow;
    uint32_t        strideInBytes         = widthStride;
    uint32_t        strideInDwords        = 0;
    uint32_t        heightStrideInRows   = heightStride;
    uint32_t        addedShiftLeftOffset    = 0;
    size_t          linearAddress          = (size_t)sysMem;
    size_t          linearAddressAligned   = 0;

    CmKernel        *kernel            = nullptr;
    CmBufferUP      *cmbufferUP        = nullptr;
    SurfaceIndex    *bufferIndexCM     = nullptr;
    SurfaceIndex    *surf2DIndexCM     = nullptr;
    CmThreadSpace   *threadSpace                = nullptr;
    CmTask          *gpuCopyTask       = nullptr;
    CmEvent         *internalEvent     = nullptr;

    uint32_t        threadWidth             = 0;
    uint32_t        threadHeight            = 0;
    uint32_t        threadNum               = 0;
    uint32_t        widthDword             = 0;
    uint32_t        widthByte              = 0;
    uint32_t        copyWidthByte         = 0;
    uint32_t        copyHeightRow         = 0;
    uint32_t        sliceCopyHeightRow   = 0;
    uint32_t        sliceCopyBufferUPSize   = 0;
    int32_t         totalBufferUPSize       = 0;
    uint32_t        startX                 = 0;
    uint32_t        startY                 = 0;
    bool            blSingleEnqueue         = true;
    CM_GPUCOPY_KERNEL *gpuCopyKernelParam     = nullptr;

    PCM_HAL_STATE   cmHalState    =        \
        ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    widthByte    = widthInPixel * sizePerPixel;

    //Align the width regarding stride
   if(strideInBytes == 0)
   {
        strideInBytes = widthByte;
   }

   if(heightStrideInRows == 0)
   {
        heightStrideInRows = heightInRow;
   }

    // the actual copy region
    copyWidthByte = MOS_MIN(strideInBytes, widthByte);
    copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);

    // Make sure stride and start address of system memory is 16-byte aligned.
    // if no padding in system memory , strideInBytes = widthByte.
    if(strideInBytes & 0xf)
    {
        CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_STRIDE;
    }
    if((linearAddress & 0xf) || (linearAddress == 0))
    {
        CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_SYSMEM;
    }

    //Calculate actual total size of system memory
    totalBufferUPSize = strideInBytes * heightStrideInRows;

    //Check thread space width here
    if( copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH *4 )
    {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

    while (totalBufferUPSize > 0)
    {
        if (sizeof (void *) == 8 ) //64-bit
        {
            linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
        }
        else  //32-bit
        {
            linearAddressAligned        = linearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
        }

        //Calculate  Left Shift offset
        addedShiftLeftOffset = (uint32_t)(linearAddress - linearAddressAligned);
        totalBufferUPSize   += addedShiftLeftOffset;

        if (totalBufferUPSize > CM_MAX_1D_SURF_WIDTH)
        {
            blSingleEnqueue = false;
            sliceCopyHeightRow = ((CM_MAX_1D_SURF_WIDTH - addedShiftLeftOffset)/(strideInBytes*(BLOCK_HEIGHT * INNER_LOOP))) * (BLOCK_HEIGHT * INNER_LOOP);
            sliceCopyBufferUPSize = sliceCopyHeightRow * strideInBytes + addedShiftLeftOffset;
            tempHeight = sliceCopyHeightRow;
        }
        else
        {
            sliceCopyHeightRow = copyHeightRow;
            sliceCopyBufferUPSize = totalBufferUPSize;
            if (!blSingleEnqueue)
            {
                tempHeight = sliceCopyHeightRow;
            }
        }

        //Check thread space height here
        if(sliceCopyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP )
        {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
            CM_ASSERTMESSAGE("Error: Invalid copy size.");
            return CM_GPUCOPY_INVALID_SIZE;
        }

        kernel = nullptr;
        CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateBufferUP(  sliceCopyBufferUPSize, ( void * )linearAddressAligned, cmbufferUP ));
        CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);

        //Configure memory object control for BufferUP to solve the cache-line issue.
        if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
        {
            CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
        }
        CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, sliceCopyHeightRow, format, direction, gpuCopyKernelParam));
        CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
        kernel = gpuCopyKernelParam->kernel;

        CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);

        CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUP);
        CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUP->GetIndex( bufferIndexCM ));
        CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex( surf2DIndexCM ));

        threadWidth = ( uint32_t )ceil( ( double )copyWidthByte/BLOCK_PIXEL_WIDTH/4 );
        threadHeight = ( uint32_t )ceil( ( double )sliceCopyHeightRow/BLOCK_HEIGHT/INNER_LOOP );
        threadNum = threadWidth * threadHeight;
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));

        if( direction == CM_FASTCOPY_CPU2GPU)
        {
            if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
            {
                CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
            }
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), bufferIndexCM) );
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surf2DIndexCM ));
        }
        else
        {
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), bufferIndexCM ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surf2DIndexCM ));
        }

        if(direction == CM_FASTCOPY_GPU2CPU)
        {
            surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
        }

        widthDword = (uint32_t)ceil((double)widthByte / 4);
        strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);

        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( uint32_t ), &strideInDwords ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( uint32_t ), &heightStrideInRows ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( uint32_t ), &addedShiftLeftOffset ));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( uint32_t ), &threadHeight ));

        if (direction == CM_FASTCOPY_GPU2CPU)  //GPU-->CPU, read
        {
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &widthDword ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &tempHeight ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 8, sizeof(uint32_t), &startX));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 9, sizeof(uint32_t), &startY));
        }
        else  //CPU-->GPU, write
        {
            //this only works for the kernel surfaceCopy_write_32x32
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( uint32_t ), &startX ));
            CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 7, sizeof( uint32_t ), &startY ));
        }

        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
        CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));
        if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
        {
            // disable turbo
            CM_TASK_CONFIG taskConfig;
            CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
            taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
            gpuCopyTask->SetProperty(taskConfig);
        }
        CM_CHK_CMSTATUS_GOTOFINISH(Enqueue(gpuCopyTask, internalEvent,
                                           threadSpace));

        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);

        //update for next slice
        linearAddress += sliceCopyBufferUPSize - addedShiftLeftOffset;
        totalBufferUPSize -= sliceCopyBufferUPSize;
        copyHeightRow -= sliceCopyHeightRow;
        startX = 0;
        startY += sliceCopyHeightRow;

        if(totalBufferUPSize > 0)   //Intermediate event, we don't need it
        {
            CM_CHK_CMSTATUS_GOTOFINISH(DestroyEvent(internalEvent));
        }
        else //Last one event, need keep or destroy it
        {
            if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
            {
                CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
            }

            if(event == CM_NO_EVENT)  //User doesn't need CmEvent for this copy
            {
                event = nullptr;
                CM_CHK_CMSTATUS_GOTOFINISH(DestroyEvent(internalEvent));
            }
            else //User needs this CmEvent
            {
                event = internalEvent;
            }
        }

        CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUP));
    }

finish:

    if(hr != CM_SUCCESS)
    {
        if(cmbufferUP == nullptr)
        {
            // user need to know whether the failure is caused by out of BufferUP.
            hr = CM_GPUCOPY_OUT_OF_RESOURCE;
        }

        if(kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
        if(threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
        if(gpuCopyTask)                       m_device->DestroyTask(gpuCopyTask);
        if(cmbufferUP)                        m_device->DestroyBufferUP(cmbufferUP);
        if(internalEvent)                     DestroyEvent(internalEvent);

        // CM_FAILURE for all the other errors
        // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
        if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
        {
            hr = CM_FAILURE;
        }
    }

    return hr;
}

int32_t CmQueueRT::EnqueueCopyInternal_2Planes(CmSurface2DRT* surface,
                                        unsigned char* sysMem,
                                        CM_SURFACE_FORMAT format,
                                        const uint32_t widthInPixel,
                                        const uint32_t widthStride,
                                        const uint32_t heightInRow,
                                        const uint32_t heightStride,
                                        const uint32_t sizePerPixel,
                                        CM_GPUCOPY_DIRECTION direction,
                                        const uint32_t option,
                                        CmEvent* & event)
{
    int32_t         hr                      = CM_SUCCESS;
    uint32_t        strideInBytes         = widthStride;
    uint32_t        strideInDwords        = 0;
    uint32_t        heightStrideInRows   = heightStride;
    size_t          linearAddressY        = 0;
    size_t          linearAddressUV       = 0;
    size_t          linearAddressAlignedY = 0;
    size_t          linearAddressAlignedUV = 0;
    uint32_t        addedShiftLeftOffsetY  = 0;
    uint32_t        addedShiftLeftOffsetUV = 0;

    CmKernel        *kernel                = nullptr;
    CmBufferUP      *cmbufferUPY          = nullptr;
    CmBufferUP      *cmbufferUPUV         = nullptr;
    SurfaceIndex    *bufferUPIndexY       = nullptr;
    SurfaceIndex    *bufferUPIndexUV      = nullptr;
    SurfaceIndex    *surf2DIndexCM         = nullptr;
    CmThreadSpace   *threadSpace           = nullptr;
    CmTask          *gpuCopyTask           = nullptr;
    CmEvent         *internalEvent         = nullptr;

    uint32_t        threadWidth             = 0;
    uint32_t        threadHeight            = 0;
    uint32_t        threadNum               = 0;
    uint32_t        widthDword             = 0;
    uint32_t        widthByte              = 0;
    uint32_t        copyWidthByte         = 0;
    uint32_t        copyHeightRow         = 0;
    uint32_t        bufferUPYSize         = 0;
    uint32_t        bufferUPUVSize        = 0;

    CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;
    PCM_HAL_STATE       cmHalState    =      \
        ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    widthByte = widthInPixel * sizePerPixel;

    //Align the width regarding stride
    if (strideInBytes == 0)
    {
        strideInBytes = widthByte;
    }

    if (heightStrideInRows == 0)
    {
        heightStrideInRows = heightInRow;
    }

    // the actual copy region
    copyWidthByte = MOS_MIN(strideInBytes, widthByte);
    copyHeightRow = MOS_MIN(heightStrideInRows, heightInRow);

    // Make sure stride and start address of system memory is 16-byte aligned.
    // if no padding in system memory , strideInBytes = widthByte.
    if (strideInBytes & 0xf)
    {
        CM_ASSERTMESSAGE("Error: Stride is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_STRIDE;
    }

    //Check thread space width here
    if (copyWidthByte > CM_MAX_THREADSPACE_WIDTH_FOR_MW * BLOCK_PIXEL_WIDTH * 4)
    {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

    linearAddressY = (size_t)sysMem;
    linearAddressUV = (size_t)((char*)sysMem + strideInBytes * heightStrideInRows);

    if ((linearAddressY & 0xf) || (linearAddressY == 0) || (linearAddressAlignedUV & 0xf))
    {
        CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_SYSMEM;
    }

    if (sizeof (void *) == 8) //64-bit
    {
        linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
        linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X64;
    }
    else  //32-bit
    {
        linearAddressAlignedY = linearAddressY & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
        linearAddressAlignedUV = linearAddressUV & ADDRESS_PAGE_ALIGNMENT_MASK_X86;
    }

    //Calculate  Left Shift offset
    addedShiftLeftOffsetY = (uint32_t)(linearAddressY - linearAddressAlignedY);
    addedShiftLeftOffsetUV = (uint32_t)(linearAddressUV - linearAddressAlignedUV);

    //Calculate actual total size of system memory, assume it's NV12/P010/P016 formats
    bufferUPYSize = strideInBytes * heightStrideInRows + addedShiftLeftOffsetY;
    bufferUPUVSize = strideInBytes * copyHeightRow * 1 / 2 + addedShiftLeftOffsetUV;

    //Check thread space height here
    if (copyHeightRow > CM_MAX_THREADSPACE_HEIGHT_FOR_MW * BLOCK_HEIGHT * INNER_LOOP)
    {  // each thread handles 128x8 block data. This API will fail if it exceeds the max thread space's size
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

    kernel = nullptr;
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPYSize, (void *)linearAddressAlignedY, cmbufferUPY));
    CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(bufferUPUVSize, (void *)linearAddressAlignedUV, cmbufferUPUV));
    CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);

    //Configure memory object control for the two BufferUP to solve the same cache-line coherency issue.
    if (cmHalState->cmHalInterface->IsGPUCopySurfaceNoCacheWARequired())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
        CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->SelectMemoryObjectControlSetting(MEMORY_OBJECT_CONTROL_SKL_NO_LLC_L3));
    }
    else
    {
        CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPY)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
        CM_CHK_CMSTATUS_GOTOFINISH(static_cast< CmBuffer_RT* >(cmbufferUPUV)->SetMemoryObjectControl(MEMORY_OBJECT_CONTROL_FROM_GTT_ENTRY, CM_WRITE_THROUGH, 0));
    }

    CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(copyWidthByte, copyHeightRow, format, direction, gpuCopyKernelParam));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
    kernel = gpuCopyKernelParam->kernel;

    CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);

    CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPY);
    CM_CHK_NULL_GOTOFINISH_CMERROR(cmbufferUPUV);
    CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPY->GetIndex(bufferUPIndexY));
    CM_CHK_CMSTATUS_GOTOFINISH(cmbufferUPUV->GetIndex(bufferUPIndexUV));
    CM_CHK_CMSTATUS_GOTOFINISH(surface->GetIndex(surf2DIndexCM));

    threadWidth = (uint32_t)ceil((double)copyWidthByte / BLOCK_PIXEL_WIDTH / 4);
    threadHeight = (uint32_t)ceil((double)copyHeightRow / BLOCK_HEIGHT / INNER_LOOP);
    threadNum = threadWidth * threadHeight;
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadNum));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));

    widthDword = (uint32_t)ceil((double)widthByte / 4);
    strideInDwords = (uint32_t)ceil((double)strideInBytes / 4);

    if (direction == CM_FASTCOPY_CPU2GPU) //Write
    {
        //Input BufferUP_Y and BufferUP_UV
        if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
        {
            CM_CHK_CMSTATUS_GOTOFINISH(surface->SetCompressionMode(MEMCOMP_DISABLED));
        }
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), bufferUPIndexY));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexUV));
        //Output Surface2D
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), surf2DIndexCM));
        //Other parameters
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
    }
    else  //Read
    {
        //Input Surface2D
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surf2DIndexCM));
        //Output BufferUP_Y and BufferUP_UV
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), bufferUPIndexY));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(SurfaceIndex), bufferUPIndexUV));
        //Other parameters
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(3, sizeof(uint32_t), &strideInDwords));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(4, sizeof(uint32_t), &heightStrideInRows));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(5, sizeof(uint32_t), &addedShiftLeftOffsetY));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(6, sizeof(uint32_t), &addedShiftLeftOffsetUV));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(7, sizeof(uint32_t), &threadHeight));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(8, sizeof(uint32_t), &widthDword));
        CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(9, sizeof(uint32_t), &heightInRow));

        surface->SetReadSyncFlag(true, this); // GPU -> CPU, set surf2d as read sync flag
    }

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
    CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel(kernel));
    if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
    {
        // disable turbo
        CM_TASK_CONFIG taskConfig;
        CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
        taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
        gpuCopyTask->SetProperty(taskConfig);
    }
    CM_CHK_CMSTATUS_GOTOFINISH(Enqueue(gpuCopyTask, internalEvent,
                                       threadSpace));

    GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);

    if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (internalEvent))
    {
        CM_CHK_CMSTATUS_GOTOFINISH(internalEvent->WaitForTaskFinished());
    }

    if (event == CM_NO_EVENT)  //User doesn't need CmEvent for this copy
    {
        event = nullptr;
        CM_CHK_CMSTATUS_GOTOFINISH(DestroyEvent(internalEvent));
    }
    else //User needs this CmEvent
    {
        event = internalEvent;
    }

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(gpuCopyTask));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPY));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(cmbufferUPUV));

finish:

    if (hr != CM_SUCCESS)
    {
        if ((cmbufferUPY == nullptr) || (cmbufferUPUV == nullptr))
        {
            // user need to know whether the failure is caused by out of BufferUP.
            hr = CM_GPUCOPY_OUT_OF_RESOURCE;
        }

        if (kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
        if (threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
        if (gpuCopyTask)                       m_device->DestroyTask(gpuCopyTask);
        if (cmbufferUPY)                      m_device->DestroyBufferUP(cmbufferUPY);
        if (cmbufferUPUV)                     m_device->DestroyBufferUP(cmbufferUPUV);
        if (internalEvent)                     DestroyEvent(internalEvent);

        // CM_FAILURE for all the other errors
        // return CM_EXCEED_MAX_TIMEOUT to notify app that gpu reset happens
        if( hr != CM_GPUCOPY_OUT_OF_RESOURCE && hr != CM_EXCEED_MAX_TIMEOUT)
        {
            hr = CM_FAILURE;
        }
    }

    return hr;
}

//*-----------------------------------------------------------------------------
//! Enqueue an task, which contains one pre-defined kernel to copy from video memory to video memory
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can
//! be used to check if the task finishes.
//! INPUT:
//!     1) Pointer to the CmSurface2D as copy destination
//!     2) Pointer to the CmSurface2D  as copy source
//!     3) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
//!     4) Reference to the pointer to CMEvent
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memery;
//!     CM_GPUCOPY_INVALID_SURFACES if input/output surfaces' width/format are different or
//!                                 input surface's height is larger than output surface's
//! Restrictions:
//!     1) Surface's width should be 64-byte aligned.
//!     2) The input surface's width/height/format should be the same as output surface's.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToGPU( CmSurface2D* outputSurface, CmSurface2D* inputSurface, uint32_t option, CmEvent* & event )
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    uint32_t srcSurfaceWidth = 0;
    uint32_t srcSurfaceHeight = 0;
    uint32_t dstSurfaceWidth = 0;
    uint32_t dstSurfaceHeight = 0;

    CM_SURFACE_FORMAT srcSurfaceFormat = CM_SURFACE_FORMAT_INVALID;
    CM_SURFACE_FORMAT dstSurfaceFormat = CM_SURFACE_FORMAT_INVALID;

    int32_t             hr = CM_SUCCESS;
    uint32_t            srcSizePerPixel = 0;
    uint32_t            dstSizePerPixel = 0;
    uint32_t            threadWidth = 0;
    uint32_t            threadHeight = 0;

    CmKernel            *kernel = nullptr;
    SurfaceIndex        *surfaceInputIndex = nullptr;
    SurfaceIndex        *surfaceOutputIndex = nullptr;
    CmThreadSpace       *threadSpace = nullptr;
    CmTask              *task = nullptr;
    uint32_t            srcSurfAlignedWidthInBytes = 0;
    CM_GPUCOPY_KERNEL *gpuCopyKernelParam = nullptr;

    if ((outputSurface == nullptr) || (inputSurface == nullptr))
    {
        CM_ASSERTMESSAGE("Error: Pointer to input surface or output surface is null.");
        return CM_FAILURE;
    }

    PCM_HAL_STATE   cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    CmSurface2DRT *outputSurfaceRT = static_cast<CmSurface2DRT *>(outputSurface);
    CmSurface2DRT *inputSurfaceRT = static_cast<CmSurface2DRT *>(inputSurface);
    if (cmHalState->cmHalInterface->IsSurfaceCompressionWARequired())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->SetCompressionMode(MEMCOMP_DISABLED));
    }

    CM_CHK_CMSTATUS_GOTOFINISH(outputSurfaceRT->GetSurfaceDesc(dstSurfaceWidth, dstSurfaceHeight, dstSurfaceFormat, dstSizePerPixel));
    CM_CHK_CMSTATUS_GOTOFINISH(inputSurfaceRT->GetSurfaceDesc(srcSurfaceWidth, srcSurfaceHeight, srcSurfaceFormat, srcSizePerPixel));

    if ((dstSurfaceWidth != srcSurfaceWidth) ||
        (dstSurfaceHeight < srcSurfaceHeight) ||  //relax the restriction
        (dstSizePerPixel != srcSizePerPixel))
    {
        CM_ASSERTMESSAGE("Error: Size of dest surface does not match src surface.");
        return CM_GPUCOPY_INVALID_SURFACES;
    }

    //To support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8
    if (dstSurfaceFormat != srcSurfaceFormat)
    {
        if (!((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8)) &&
            !((dstSurfaceFormat == CM_SURFACE_FORMAT_A8R8G8B8) && (srcSurfaceFormat == CM_SURFACE_FORMAT_A8B8G8R8)))
        {
            CM_ASSERTMESSAGE("Error: Only support copy b/w Format_A8R8G8B8 and Format_A8B8G8R8 if src format is not matched with dst format.");
            return CM_GPUCOPY_INVALID_SURFACES;
        }
    }

    // 128Bytes aligned
    srcSurfAlignedWidthInBytes = (uint32_t)(ceil((double)srcSurfaceWidth*srcSizePerPixel / BLOCK_PIXEL_WIDTH / 4) * (BLOCK_PIXEL_WIDTH * 4));

    if (srcSurfaceHeight > CM_MAX_THREADSPACE_WIDTH_FOR_MW *BLOCK_HEIGHT *INNER_LOOP)
    {
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

    CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(srcSurfaceWidth*srcSizePerPixel, srcSurfaceHeight, srcSurfaceFormat, CM_FASTCOPY_GPU2GPU, gpuCopyKernelParam));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);

    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
    kernel = gpuCopyKernelParam->kernel;

    CM_CHK_CMSTATUS_GOTOFINISH(inputSurface->GetIndex(surfaceInputIndex));
    CM_CHK_CMSTATUS_GOTOFINISH(outputSurface->GetIndex(surfaceOutputIndex));

    threadWidth = srcSurfAlignedWidthInBytes / (BLOCK_PIXEL_WIDTH * 4);
    threadHeight = (uint32_t)ceil((double)srcSurfaceHeight / BLOCK_HEIGHT / INNER_LOOP);

    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));

    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(0, sizeof(SurfaceIndex), surfaceInputIndex));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(1, sizeof(SurfaceIndex), surfaceOutputIndex));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg(2, sizeof(uint32_t), &threadHeight));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
    CM_CHK_NULL_GOTOFINISH_CMERROR(task);
    CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel(kernel));

    if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
    {
        // disable turbo
        CM_TASK_CONFIG taskConfig;
        CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
        taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
        task->SetProperty(taskConfig);
    }

    CM_CHK_CMSTATUS_GOTOFINISH(Enqueue(task, event, threadSpace));
    if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
    {
        CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
    }

finish:

    if (kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
    if (threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
    if (task)                              m_device->DestroyTask(task);

    return hr;
}

//*-----------------------------------------------------------------------------
//! Enqueue an task, which contains one pre-defined kernel to copy from system memory to system memory
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of the task.
//! A CmEvent is generated each time a task is enqueued. The CmEvent can be used to check if the task finishs.
//! If the size is less than 1KB,  CPU is used to do the copy and event will be set as nullptr .
//!
//! INPUT:
//!     1) Pointer to the system memory as copy destination
//!     2) Pointer to the system memory as copy source
//!     3) The size in bytes of memory be copied.
//!     4) Option passed from user, blocking copy, non-blocking copy or disable turbo boost
//!     5) Reference to the pointer to CMEvent
//! OUTPUT:
//!     CM_SUCCESS if the task is successfully enqueued and the CmEvent is generated;
//!     CM_OUT_OF_HOST_MEMORY if out of host memery;
//!     CM_GPUCOPY_INVALID_SYSMEM if the sysMem is not 16-byte aligned or is NULL.
//!     CM_GPUCOPY_OUT_OF_RESOURCE if runtime run out of BufferUP.
//!     CM_GPUCOPY_INVALID_SIZE  if its size plus shift-left offset large than CM_MAX_1D_SURF_WIDTH.
//! Restrictions:
//!     1) dstSysMem and srcSysMem should be 16-byte aligned.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToCPU( unsigned char* dstSysMem, unsigned char* srcSysMem, uint32_t size, uint32_t option, CmEvent* & event )
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    int hr = CM_SUCCESS;
    size_t inputLinearAddress  = (size_t )srcSysMem;
    size_t outputLinearAddress = (size_t )dstSysMem;

    size_t inputLinearAddressAligned = 0;
    size_t outputLinearAddressAligned = 0;

    CmBufferUP      *surfaceInput          = nullptr;
    CmBufferUP      *surfaceOutput         = nullptr;
    CmKernel        *kernel                = nullptr;
    SurfaceIndex    *surfaceInputIndex     = nullptr;
    SurfaceIndex    *surfaceOutputIndex    = nullptr;
    CmThreadSpace   *threadSpace                    = nullptr;
    CmTask          *task                  = nullptr;

    int32_t         srcLeftShiftOffset      = 0;
    int32_t         dstLeftShiftOffset      = 0;
    uint32_t        threadWidth             = 0;
    uint32_t        threadHeight            = 0;
    uint32_t        threadNum              = 0;
    uint32_t        gpuMemcopySize        = 0;
    uint32_t        cpuMemcopySize        = 0;
    CM_GPUCOPY_KERNEL *gpuCopyKernelParam     = nullptr;

    if((inputLinearAddress & 0xf) || (outputLinearAddress & 0xf) ||
        (inputLinearAddress == 0) || (outputLinearAddress == 0))
    {
        CM_ASSERTMESSAGE("Error: Start address of system memory is not 16-byte aligned.");
        return CM_GPUCOPY_INVALID_SYSMEM;
    }

    // Get page aligned address
    if (sizeof (void *) == 8 ) //64-bit
    {
        inputLinearAddressAligned  = inputLinearAddress  & ADDRESS_PAGE_ALIGNMENT_MASK_X64;  // make sure the address page aligned.
        outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X64;  // make sure the address page aligned.
    }
    else
    {
        inputLinearAddressAligned  = inputLinearAddress  & ADDRESS_PAGE_ALIGNMENT_MASK_X86;  // make sure the address page aligned.
        outputLinearAddressAligned = outputLinearAddress & ADDRESS_PAGE_ALIGNMENT_MASK_X86;  // make sure the address page aligned.
    }

    srcLeftShiftOffset = (int32_t)(inputLinearAddress  - inputLinearAddressAligned) ;
    dstLeftShiftOffset = (int32_t)(outputLinearAddress - outputLinearAddressAligned) ;

    if(((size + srcLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH)||
       ((size + dstLeftShiftOffset) > CM_MAX_1D_SURF_WIDTH))
    {
        CM_ASSERTMESSAGE("Error: Invalid copy size.");
        return CM_GPUCOPY_INVALID_SIZE;
    }

    threadWidth  = 0;
    threadHeight = 0;
    threadNum = size / BYTE_COPY_ONE_THREAD; // each thread copys 32 x 4 x32 bytes = 1K

    if( threadNum == 0)
    {
        //if the size of data is less than data copied per thread ( 4K), use CPU to copy it instead of GPU.
        CmFastMemCopy((void *)(outputLinearAddress),
                      (void *)(inputLinearAddress),
                      size); //SSE copy used in CMRT.

        event = nullptr;
        return CM_SUCCESS;
    }

    //Calculate proper thread space's width and height
    threadWidth  = 1;
    threadHeight = threadNum/threadWidth;
    while((threadHeight > CM_MAX_THREADSPACE_HEIGHT_FOR_MW))
    {
        if(threadWidth > CM_MAX_THREADSPACE_WIDTH_FOR_MW)
        {
            hr = CM_GPUCOPY_INVALID_SIZE; // thread number exceed 511*511
            goto finish;
        }
        else if (threadWidth == 1)
        {
            threadWidth  =  THREAD_SPACE_WIDTH_INCREMENT; // first time,
            threadHeight = threadNum/threadWidth;
        }
        else
        {
            threadWidth +=  THREAD_SPACE_WIDTH_INCREMENT; // increase 8 per iteration
            threadHeight = threadNum/threadWidth;
        }
    }

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + srcLeftShiftOffset, (void *)inputLinearAddressAligned,surfaceInput));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateBufferUP(size + dstLeftShiftOffset, (void *)outputLinearAddressAligned,surfaceOutput));

    CM_CHK_CMSTATUS_GOTOFINISH(CreateGPUCopyKernel(size, 0, CM_SURFACE_FORMAT_INVALID, CM_FASTCOPY_CPU2CPU, gpuCopyKernelParam));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam->kernel);
    kernel = gpuCopyKernelParam->kernel;

    CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceInput);
    CM_CHK_CMSTATUS_GOTOFINISH(surfaceInput->GetIndex(surfaceInputIndex));
    CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceOutput);
    CM_CHK_CMSTATUS_GOTOFINISH(surfaceOutput->GetIndex(surfaceOutputIndex));

    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount(threadWidth * threadHeight));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( SurfaceIndex ), surfaceInputIndex ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), surfaceOutputIndex ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 2, sizeof( int ), &threadWidth ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 3, sizeof( int ), &threadHeight ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 4, sizeof( int ), &srcLeftShiftOffset ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 5, sizeof( int ), &dstLeftShiftOffset ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 6, sizeof( int ), &size ));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace(threadWidth, threadHeight, threadSpace));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(task));
    CM_CHK_NULL_GOTOFINISH_CMERROR(task);
    CM_CHK_CMSTATUS_GOTOFINISH(task->AddKernel (kernel));

    if (option & CM_FASTCOPY_OPTION_DISABLE_TURBO_BOOST)
    {
        // disable turbo
        CM_TASK_CONFIG taskConfig;
        CmSafeMemSet(&taskConfig, 0, sizeof(CM_TASK_CONFIG));
        taskConfig.turboBoostFlag = CM_TURBO_BOOST_DISABLE;
        task->SetProperty(taskConfig);
    }

    CM_CHK_CMSTATUS_GOTOFINISH(Enqueue(task, event, threadSpace));

    if ((option & CM_FASTCOPY_OPTION_BLOCKING) && (event))
    {
        CM_CHK_CMSTATUS_GOTOFINISH(event->WaitForTaskFinished());
    }

    //Copy the unaligned part by using CPU
    gpuMemcopySize = threadHeight * threadWidth *BYTE_COPY_ONE_THREAD;
    cpuMemcopySize = size - threadHeight * threadWidth *BYTE_COPY_ONE_THREAD;

    CmFastMemCopy((void *)(outputLinearAddress+gpuMemcopySize),
                  (void *)(inputLinearAddress+gpuMemcopySize),
                          cpuMemcopySize); //SSE copy used in CMRT.

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyThreadSpace(threadSpace));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyTask(task));
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceOutput));   // ref_cnf to guarantee task finish before BufferUP being really destroy.
    CM_CHK_CMSTATUS_GOTOFINISH(m_device->DestroyBufferUP(surfaceInput));

    GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);

finish:
    if(hr != CM_SUCCESS)
    {   //Failed
        if( surfaceInput == nullptr || surfaceOutput == nullptr)
        {
            hr = CM_GPUCOPY_OUT_OF_RESOURCE; // user need to know whether the failure is caused by out of BufferUP.
        }
        else
        {
            hr = CM_FAILURE;
        }
        if(surfaceInput)                      m_device->DestroyBufferUP(surfaceInput);
        if(surfaceOutput)                     m_device->DestroyBufferUP(surfaceOutput);
        if(kernel && gpuCopyKernelParam)        GPUCOPY_KERNEL_UNLOCK(gpuCopyKernelParam);
        if(threadSpace)                                m_device->DestroyThreadSpace(threadSpace);
        if(task)                              m_device->DestroyTask(task);
    }

    return hr;
}

//*----------------------------------------------------------------------------------------
//| Purpose:    Pop task from flushed Queue, Update surface state and Destroy the task
//| Notes:
//*----------------------------------------------------------------------------------------
void CmQueueRT::PopTaskFromFlushedQueue()
{
    CmTaskInternal* topTask = (CmTaskInternal*)m_flushedTasks.Pop();

    if ( topTask != nullptr )
    {
        CmEventRT *event = nullptr;
        topTask->GetTaskEvent( event );
        if ( event != nullptr )
        {
            LARGE_INTEGER nTime;
            if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nTime.QuadPart )) )
            {
                CM_ASSERTMESSAGE("Error: Query performace counter failure.");
            }
            else
            {
                event->SetCompleteTime( nTime );
            }
        }

#if MDF_SURFACE_CONTENT_DUMP
        PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
        if (cmData->cmHalState->dumpSurfaceContent)
        {
            int32_t taskId = 0;
            if (event != nullptr)
            {
                event->GetTaskDriverId(taskId);
            }
            topTask->SurfaceDump(taskId);
        }
#endif

        CmTaskInternal::Destroy( topTask );
    }
    return;
}

int32_t CmQueueRT::TouchFlushedTasks( )
{
    int32_t hr = CM_SUCCESS;

    if (m_flushedTasks.IsEmpty())
    {
        if (!m_enqueuedTasks.IsEmpty())
        {
            // if FlushedQueue is empty and EnqueuedQueue is not empty
            // try flush task to FlushedQueue
            hr = FlushTaskWithoutSync();
            if (FAILED(hr))
            {
                return hr;
            }
        }
        else
        {   // no task in flushedQueue and EnqueuedQueue
            return CM_FAILURE;
        }
    }

    // Flush FlushedQueue
    hr = QueryFlushedTasks();

    return hr;
}

//*-----------------------------------------------------------------------------
//! Flush the queue, i.e. submit all tasks in the queue to execute according
//! to their order in the the queue. The queue will be empty after flush,
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT:
//! OUTPUT:
//!     CM_SUCCESS if all tasks in the queue are submitted
//!     CM_FAILURE otherwise.
//!     More error code is coming.
//!
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::QueryFlushedTasks()
{
    int32_t hr   = CM_SUCCESS;

    m_criticalSectionFlushedTask.Acquire();
    while( !m_flushedTasks.IsEmpty() )
    {
        CmTaskInternal* task = (CmTaskInternal*)m_flushedTasks.Top();
        CM_CHK_NULL_GOTOFINISH_CMERROR(task);

        CM_STATUS status = CM_STATUS_FLUSHED ;
        task->GetTaskStatus(status);
        if( status == CM_STATUS_FINISHED )
        {
            PopTaskFromFlushedQueue();
        }
        else
        {
            // media reset
            if (status == CM_STATUS_RESET)
            {
                PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();

                // Clear task status table in Cm Hal State
                int32_t taskId;
                CmEventRT*pTopTaskEvent;
                task->GetTaskEvent(pTopTaskEvent);
                CM_CHK_NULL_GOTOFINISH_CMERROR(pTopTaskEvent);

                pTopTaskEvent->GetTaskDriverId(taskId);
                cmData->cmHalState->taskStatusTable[taskId] = CM_INVALID_INDEX;

                //Pop task and Destroy it
                PopTaskFromFlushedQueue();
            }

            // It is an in-order queue, if this one hasn't finshed,
            // the following ones haven't finished either.
            break;
        }
    }

finish:
    m_criticalSectionFlushedTask.Release();

    return hr;
}

//*-----------------------------------------------------------------------------
//! This is a blocking call. It will NOT return untill
//! all tasks in GPU and all tasks in queue finishes execution.
//! It will first flush the queue if the queue is not empty.
//! INPUT:
//! OUTPUT:
//!     CM_SUCCESS if all tasks finish execution.
//!     CM_FAILURE otherwise.
//!     More error code is coming.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::DestroyEvent( CmEvent* & event )
{

    CLock Lock(m_criticalSectionEvent);

    if (event == nullptr)
    {
        return CM_FAILURE;
    }

    uint32_t index = 0;

    CmEventRT *eventRT = static_cast<CmEventRT *>(event);
    eventRT->GetIndex(index);
    CM_ASSERT( m_eventArray.GetElement( index ) == eventRT );

    int32_t status = CmEventRT::Destroy( eventRT );
    if( status == CM_SUCCESS && eventRT == nullptr)
    {
        m_eventArray.SetElement(index, nullptr);
    }

    // Should return nullptr to application even the event is not destroyed
    // since its reference count is not zero
    event = nullptr;

    return status;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Clean the Queue if its tasks time out
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::CleanQueue( )
{

    int32_t status = CM_SUCCESS;

    // Maybe not necessary since
    // it is called by ~CmDevice only
    // Update: necessary because it calls FlushBlockWithoutSync
    if( !m_enqueuedTasks.IsEmpty() )
    {
        // If there are tasks not flushed (i.e. not send to driver )
        // wait untill all such tasks are flushed
        FlushTaskWithoutSync( true );
    }
    CM_ASSERT( m_enqueuedTasks.IsEmpty() );

    //Used for timeout detection
    LARGE_INTEGER freq;
    MOS_QueryPerformanceFrequency((uint64_t*)&freq.QuadPart);
    LARGE_INTEGER start;
    MOS_QueryPerformanceCounter((uint64_t*)&start.QuadPart);
    int64_t timeout = start.QuadPart + (CM_MAX_TIMEOUT * freq.QuadPart * m_flushedTasks.GetCount()); //Count to timeout at

    while( !m_flushedTasks.IsEmpty() && status != CM_EXCEED_MAX_TIMEOUT )
    {
        QueryFlushedTasks();

        LARGE_INTEGER current;
        MOS_QueryPerformanceCounter((uint64_t*)&current.QuadPart);
        if( current.QuadPart > timeout )
            status = CM_EXCEED_MAX_TIMEOUT;
    }

    return status;
}

CM_QUEUE_CREATE_OPTION &CmQueueRT::GetQueueOption()
{
    return m_queueOption;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Get the count of task in queue
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::GetTaskCount( uint32_t& numTasks )
{
    numTasks = m_enqueuedTasks.GetCount() + m_flushedTasks.GetCount();
    return CM_SUCCESS;
}

//*-----------------------------------------------------------------------------
//| Purpose:   Use GPU to init Surface2D
//| Returns:   result of operation
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueInitSurface2D( CmSurface2D* surf2D, const uint32_t initValue, CmEvent* &event)
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuInitKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    int32_t         hr                      = CM_SUCCESS;
    uint32_t        width                   = 0;
    uint32_t        height                  = 0;
    uint32_t        sizePerPixel            = 0;
    CmProgram       *gpuInitKernelProgram  = nullptr;
    CmKernel        *kernel                = nullptr;
    SurfaceIndex    *outputIndexCM         = nullptr;
    CmThreadSpace   *threadSpace           = nullptr;
    CmTask          *gpuCopyTask           = nullptr;
    uint32_t        threadWidth             = 0;
    uint32_t        threadHeight            = 0;
    uint32_t        threadNum               = 0;
    CmSurfaceManager* surfaceMgr           = nullptr;
    CM_SURFACE_FORMAT      format           = CM_SURFACE_FORMAT_INVALID;

    if(!surf2D)
    {
        CM_ASSERTMESSAGE("Error: Pointer to surface 2d is null.");
        return CM_FAILURE;
    }
    CmSurface2DRT *surf2DRT = static_cast<CmSurface2DRT *>(surf2D);

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->LoadPredefinedInitKernel(gpuInitKernelProgram));

    CM_CHK_CMSTATUS_GOTOFINISH(surf2DRT->GetSurfaceDesc(width, height, format,sizePerPixel));

    m_device->GetSurfaceManager(surfaceMgr);
    CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceMgr);

    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set_NV12 ), kernel, "PredefinedGPUCopyKernel"));
    }
    else
    {
        CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuInitKernelProgram, _NAME( surfaceCopy_set ), kernel, "PredefinedGPUCopyKernel" ));
    }
    CM_CHK_NULL_GOTOFINISH_CMERROR(kernel);
    CM_CHK_CMSTATUS_GOTOFINISH(surf2D->GetIndex( outputIndexCM ));

    threadWidth = ( uint32_t )ceil( ( double )width*sizePerPixel/BLOCK_PIXEL_WIDTH/4 );
    threadHeight = ( uint32_t )ceil( ( double )height/BLOCK_HEIGHT );
    threadNum = threadWidth * threadHeight;
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetThreadCount( threadNum ));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateThreadSpace( threadWidth, threadHeight, threadSpace ));
    CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);

    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 0, sizeof( uint32_t ), &initValue ));
    CM_CHK_CMSTATUS_GOTOFINISH(kernel->SetKernelArg( 1, sizeof( SurfaceIndex ), outputIndexCM ));

    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateTask(gpuCopyTask));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyTask);

    CM_CHK_CMSTATUS_GOTOFINISH(gpuCopyTask->AddKernel( kernel ));

    CM_CHK_CMSTATUS_GOTOFINISH(Enqueue(gpuCopyTask, event, threadSpace));

finish:

    if (kernel)        m_device->DestroyKernel( kernel );
    if (gpuCopyTask)   m_device->DestroyTask(gpuCopyTask);
    if (threadSpace)            m_device->DestroyThreadSpace(threadSpace);

    return hr;
}

//*-----------------------------------------------------------------------------
//! Flush a geneal task to HAL CM layer for execution.
//! This is a non-blocking call. i.e. it returs immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT: task -- Pointer to CmTaskInternal object
//! OUTPUT:
//!     CM_SUCCESS if all tasks in the queue are submitted
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushGeneralTask(CmTaskInternal* task)
{
    CM_RETURN_CODE          hr              = CM_SUCCESS;
    CM_HAL_EXEC_TASK_PARAM  param;
    PCM_HAL_KERNEL_PARAM    kernelParam    = nullptr;
    CmKernelData*           kernelData     = nullptr;
    uint32_t                kernelDataSize  = 0;
    PCM_CONTEXT_DATA        cmData         = nullptr;
    CmEventRT*              event          = nullptr;
    uint32_t                totalThreadCount= 0;
    uint32_t                count           = 0;
    PCM_HAL_KERNEL_PARAM    tempData       = nullptr;
    uint32_t                maxTSWidth      = 0;
    bool                    hasThreadArg    = false;

    CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_PARAM ) );

    //GT-PIN
    if(m_device->CheckGTPinEnabled())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surfEntryInfoArrays));
    }

    task->GetKernelCount( count );
    param.numKernels = count;

    param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM,count);
    param.kernelSizes = MOS_NewArray(uint32_t,count);
    param.kernelCurbeOffset = MOS_NewArray(uint32_t,count);
    param.queueOption = m_queueOption;

    CM_CHK_NULL_GOTOFINISH(param.kernels, CM_OUT_OF_HOST_MEMORY);
    CM_CHK_NULL_GOTOFINISH(param.kernelSizes, CM_OUT_OF_HOST_MEMORY);
    CM_CHK_NULL_GOTOFINISH(param.kernelCurbeOffset, CM_OUT_OF_HOST_MEMORY);

    for( uint32_t i = 0; i < count; i ++ )
    {
        task->GetKernelData( i, kernelData );
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);

        kernelParam = kernelData->GetHalCmKernelData();
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelParam);

        hasThreadArg |= kernelParam->perThreadArgExisted;

        task->GetKernelDataSize( i, kernelDataSize );
        if(kernelDataSize == 0)
        {
            CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
            hr = CM_FAILURE;
            goto finish;
        }

        tempData = kernelData->GetHalCmKernelData();

        param.kernels[ i ]             = tempData;
        param.kernelSizes[ i ]        = kernelDataSize;
        param.kernelCurbeOffset[ i ]  = task->GetKernelCurbeOffset(i);
        param.globalSurfaceUsed       |= tempData->globalSurfaceUsed;
        param.kernelDebugEnabled      |= tempData->kernelDebugEnabled;
    }

    /*
    * Preset the default TS width/height/dependency:
    *     TS width   = MOS_MIN(CM_MAX_THREADSPACE_WIDTH, threadcount)
    *     TS height  = totalThreadCount/CM_MAX_THREADSPACE_WIDTH + 1
    *     dependency = CM_NONE_DEPENDENCY
    * For threadSpace is nullptr case, we will pass the default TS width/height/dependency to driver
    * For threadSpace is valid case, the TS width/height/dependency will be update according to thread space set by user.
    */
    task->GetTotalThreadCount(totalThreadCount);

    if (hasThreadArg)
    {
        maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW + 1; // 512 allowed for media object
    }
    else
    {
        maxTSWidth = CM_MAX_THREADSPACE_WIDTH_FOR_MW; // 511 for media walker
    }

    param.threadSpaceWidth = (totalThreadCount > maxTSWidth) ? maxTSWidth : totalThreadCount;
    if(totalThreadCount%maxTSWidth)
    {
        param.threadSpaceHeight = totalThreadCount/maxTSWidth + 1;
    }
    else
    {
        param.threadSpaceHeight = totalThreadCount/maxTSWidth;
    }

    param.dependencyPattern = CM_NONE_DEPENDENCY;

    if (task->IsThreadSpaceCreated()) //scoreboard data preparation
    {
        if(task->IsThreadCoordinatesExisted())
        {
            param.threadCoordinates = MOS_NewArray(PCM_HAL_SCOREBOARD, count);
            param.dependencyMasks = MOS_NewArray(PCM_HAL_MASK_AND_RESET, count);

            CM_CHK_NULL_GOTOFINISH(param.threadCoordinates, CM_OUT_OF_HOST_MEMORY);
            CM_CHK_NULL_GOTOFINISH(param.dependencyMasks, CM_OUT_OF_HOST_MEMORY);
            for(uint32_t i=0; i<count; i++)
            {
                void *kernelCoordinates = nullptr;
                void *dependencyMasks = nullptr;
                task->GetKernelCoordinates(i, kernelCoordinates);
                task->GetKernelDependencyMasks(i, dependencyMasks);
                param.threadCoordinates[i] = (PCM_HAL_SCOREBOARD)kernelCoordinates;
                param.dependencyMasks[i] = (PCM_HAL_MASK_AND_RESET)dependencyMasks;
            }
        }
        else
        {
            param.threadCoordinates = nullptr;
        }

        task->GetDependencyPattern(param.dependencyPattern);

        task->GetThreadSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight);

        task->GetWalkingPattern(param.walkingPattern);

        if( task->CheckWalkingParametersSet( ) )
        {
            param.walkingParamsValid = 1;
            CM_CHK_CMSTATUS_GOTOFINISH(task->GetWalkingParameters(param.walkingParams));
        }
        else
        {
            param.walkingParamsValid = 0;
        }

        if( task->CheckDependencyVectorsSet( ) )
        {
            param.dependencyVectorsValid = 1;
            CM_CHK_CMSTATUS_GOTOFINISH(task->GetDependencyVectors(param.dependencyVectors));
        }
        else
        {
            param.dependencyVectorsValid = 0;
        }
    }
    if (param.threadSpaceWidth == 0)
    {
        CM_ASSERTMESSAGE("Error: Invalid thread space.");
        hr = CM_INVALID_THREAD_SPACE;
        goto finish;
    }
    task->GetColorCountMinusOne(param.colorCountMinusOne);
    task->GetMediaWalkerGroupSelect(param.mediaWalkerGroupSelect);

    param.syncBitmap = task->GetSyncBitmap();
    param.conditionalEndBitmap = task->GetConditionalEndBitmap();
    param.userDefinedMediaState = task->GetMediaStatePtr();
    CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
    CmSafeMemCopy(&param.taskConfig, task->GetTaskConfig(), sizeof(param.taskConfig));
    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));

    m_device->RegisterSyncEvent(nullptr);
    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteTask(cmData->cmHalState, &param));

    if( param.taskIdOut < 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid task ID.");
        hr = CM_FAILURE;
        goto finish;
    }

    TASK_LOG(task);

    task->GetTaskEvent( event );
    CM_CHK_NULL_GOTOFINISH_CMERROR(event);
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
    CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());

    //GT-PIN
    if(m_device->CheckGTPinEnabled())
    {
        //No need to clear the SurEntryInfoArrays here. It will be destored by CmInternalTask
        CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surfEntryInfoArrays));
    }

finish:
    MosSafeDeleteArray( param.kernels );
    MosSafeDeleteArray( param.kernelSizes );
    MosSafeDeleteArray( param.threadCoordinates);
    MosSafeDeleteArray( param.dependencyMasks);
    MosSafeDeleteArray( param.kernelCurbeOffset);

    return hr;
}

//*-----------------------------------------------------------------------------
//! Flush a thread group based task to HAL CM layer for execution.
//! This is a non-blocking call. i.e. it returs immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT: task -- Pointer to CmTaskInternal object
//! OUTPUT:
//!     CM_SUCCESS if all tasks in the queue are submitted
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushGroupTask(CmTaskInternal* task)
{
    CM_RETURN_CODE  hr          = CM_SUCCESS;

    CM_HAL_EXEC_TASK_GROUP_PARAM param;
    CmKernelData* kernelData   = nullptr;
    uint32_t kernelDataSize        = 0;
    uint32_t count                  = 0;
    PCM_CONTEXT_DATA cmData    = nullptr;
    CmEventRT * event          = nullptr;
    PCM_HAL_KERNEL_PARAM tempData  = nullptr;

    CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_TASK_GROUP_PARAM ) );

    //GT-PIN
    if(this->m_device->CheckGTPinEnabled())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(task->GetKernelSurfInfo(param.surEntryInfoArrays));
    }

    task->GetKernelCount( count );
    param.numKernels = count;

    param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
    param.kernelSizes = MOS_NewArray(uint32_t, count);
    param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
    param.queueOption = m_queueOption;

    CmSafeMemCopy(&param.taskConfig, task->GetTaskConfig(), sizeof(param.taskConfig));
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);

    for( uint32_t i = 0; i < count; i ++ )
    {
        task->GetKernelData( i, kernelData );
        CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);

        task->GetKernelDataSize( i, kernelDataSize );
        if( kernelDataSize == 0)
        {
            CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
            hr = CM_FAILURE;
            goto finish;
        }

        tempData = kernelData->GetHalCmKernelData( );

        param.kernels[ i ]             = tempData;
        param.kernelSizes[ i ]        = kernelDataSize;
        param.kernelCurbeOffset [ i ] = task->GetKernelCurbeOffset(i);
        param.globalSurfaceUsed        |= tempData->globalSurfaceUsed;
        param.kernelDebugEnabled       |= tempData->kernelDebugEnabled;
    }

    task->GetSLMSize(param.slmSize);
    if(param.slmSize > MAX_SLM_SIZE_PER_GROUP_IN_1K)
    {
        CM_ASSERTMESSAGE("Error: SLM size exceeds the maximum per group.");
        hr = CM_EXCEED_MAX_SLM_SIZE;
        goto finish;
    }

    if (task->IsThreadGroupSpaceCreated())//thread group size
    {
        task->GetThreadGroupSpaceSize(param.threadSpaceWidth, param.threadSpaceHeight,
                                      param.threadSpaceDepth, param.groupSpaceWidth,
                                      param.groupSpaceHeight, param.groupSpaceDepth);
    }

    param.syncBitmap = task->GetSyncBitmap();
    param.conditionalEndBitmap = task->GetConditionalEndBitmap();
    param.userDefinedMediaState = task->GetMediaStatePtr();
    CmSafeMemCopy(param.conditionalEndInfo, task->GetConditionalEndInfo(), sizeof(param.conditionalEndInfo));
    CmSafeMemCopy(param.krnExecCfg, task->GetKernelExecuteConfig(), sizeof(param.krnExecCfg));

    // Call HAL layer to execute pfnExecuteGroupTask
    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnSetPowerOption( cmData->cmHalState, task->GetPowerOption() ) );

    m_device->RegisterSyncEvent(nullptr);
    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnExecuteGroupTask( cmData->cmHalState, &param ) );

    if( param.taskIdOut < 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid task ID.");
        hr = CM_FAILURE;
        goto finish;
    }
    TASK_LOG(task);
    task->GetTaskEvent( event );
    CM_CHK_NULL_GOTOFINISH_CMERROR( event );
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
    CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());

    //GT-PIN
    if(this->m_device->CheckGTPinEnabled())
    {
        CM_CHK_CMSTATUS_GOTOFINISH(event->SetSurfaceDetails(param.surEntryInfoArrays));
    }

finish:
    MosSafeDeleteArray( param.kernels );
    MosSafeDeleteArray( param.kernelSizes );
    MosSafeDeleteArray( param.kernelCurbeOffset);

    return hr;
}

//*-----------------------------------------------------------------------------
//! Flush a VEBOX task to HAL CM layer for execution.
//! This is a non-blocking call. i.e. it returs immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT: task -- Pointer to CmTaskInternal object
//! OUTPUT:
//!     CM_SUCCESS if all tasks in the queue are submitted
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushVeboxTask(CmTaskInternal* task)
{
    CM_RETURN_CODE  hr          = CM_SUCCESS;

    CM_HAL_EXEC_VEBOX_TASK_PARAM param;
    PCM_CONTEXT_DATA cmData    = nullptr;
    CmEventRT * event          = nullptr;
    uint8_t *stateData           = nullptr;
    uint8_t *surfaceData         = nullptr;
    CmBuffer_RT * temp          = nullptr;

    CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_VEBOX_TASK_PARAM ) );

    //Set VEBOX state data pointer and size
    //Set VEBOX surface data pointer and size
    CM_VEBOX_STATE cmVeboxState;
    CmBufferUP *veboxParamBuf = nullptr;
    CM_VEBOX_SURFACE_DATA cmVeboxSurfaceData;
    task->GetVeboxState(cmVeboxState);
    task->GetVeboxParam(veboxParamBuf);
    task->GetVeboxSurfaceData(cmVeboxSurfaceData);
    CM_CHK_NULL_GOTOFINISH_CMERROR(veboxParamBuf);

    temp = static_cast<CmBuffer_RT*>(veboxParamBuf);
    temp->GetHandle(param.veboxParamIndex);

    param.cmVeboxState = cmVeboxState;
    param.veboxParam = veboxParamBuf;

    param.veboxSurfaceData = cmVeboxSurfaceData;

    param.queueOption = m_queueOption;

    //Set VEBOX task id to -1
    param.taskIdOut = -1;

    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    m_device->RegisterSyncEvent(nullptr);
    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR( cmData->cmHalState->pfnExecuteVeboxTask( cmData->cmHalState, &param ) );

    if( param.taskIdOut < 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid task ID.");
        hr = CM_FAILURE;
        goto finish;
    }

    task->GetTaskEvent( event );
    CM_CHK_NULL_GOTOFINISH_CMERROR( event );
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));

finish:
    return hr;
}

//*-----------------------------------------------------------------------------
//! Flush the queue, i.e. submit all tasks in the queue to execute according
//! to their order in the the queue. The queue will be empty after flush,
//! This is a non-blocking call. i.e. it returns immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT:
//! OUTPUT:
//!     CM_SUCCESS if all tasks in the queue are submitted
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushEnqueueWithHintsTask( CmTaskInternal* task )
{
    CM_RETURN_CODE               hr             = CM_SUCCESS;
    CM_HAL_EXEC_HINTS_TASK_PARAM param;
    PCM_CONTEXT_DATA             cmData        = nullptr;
    CmKernelData*                kernelData    = nullptr;
    uint32_t                     kernelDataSize = 0;
    uint32_t                     count          = 0;
    CmEventRT                    *event        = nullptr;
    PCM_HAL_KERNEL_PARAM         tempData      = nullptr;

    CmSafeMemSet( &param, 0, sizeof( CM_HAL_EXEC_HINTS_TASK_PARAM ) );

    task->GetKernelCount ( count );
    param.numKernels = count;

    param.kernels = MOS_NewArray(PCM_HAL_KERNEL_PARAM, count);
    param.kernelSizes = MOS_NewArray(uint32_t, count);
    param.kernelCurbeOffset = MOS_NewArray(uint32_t, count);
    param.queueOption = m_queueOption;

    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernels);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelSizes);
    CM_CHK_NULL_GOTOFINISH_CMERROR(param.kernelCurbeOffset);

    task->GetHints(param.hints);
    task->GetNumTasksGenerated(param.numTasksGenerated);
    task->GetLastTask(param.isLastTask);

    for( uint32_t i = 0; i < count; i ++ )
    {
        task->GetKernelData( i, kernelData );
        CM_CHK_NULL_GOTOFINISH_CMERROR( kernelData );

        task->GetKernelDataSize( i, kernelDataSize );
        if( kernelDataSize == 0 )
        {
            CM_ASSERTMESSAGE("Error: Invalid kernel data size.");
            hr = CM_FAILURE;
            goto finish;
        }

        tempData = kernelData->GetHalCmKernelData();

        param.kernels[ i ]             = tempData;
        param.kernelSizes[ i ]         = kernelDataSize;
        param.kernelCurbeOffset[ i ]   = task->GetKernelCurbeOffset(i);
    }

    param.userDefinedMediaState = task->GetMediaStatePtr();
    cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    CM_CHK_NULL_GOTOFINISH_CMERROR(cmData);

    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnSetPowerOption(cmData->cmHalState, task->GetPowerOption()));

    m_device->RegisterSyncEvent(nullptr);
    CM_CHK_MOSSTATUS_GOTOFINISH_CMERROR(cmData->cmHalState->pfnExecuteHintsTask(cmData->cmHalState, &param));

    if( param.taskIdOut < 0 )
    {
        CM_ASSERTMESSAGE("Error: Invalid task ID.");
        hr = CM_FAILURE;
        goto finish;
    }

    TASK_LOG(task);

    task->GetTaskEvent( event );
    CM_CHK_NULL_GOTOFINISH_CMERROR( event );
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskDriverId( param.taskIdOut ));
    CM_CHK_CMSTATUS_GOTOFINISH(event->SetTaskOsData( param.osData ));
    CM_CHK_CMSTATUS_GOTOFINISH(task->ResetKernelDataStatus());

finish:

    MosSafeDeleteArray( param.kernels );
    MosSafeDeleteArray( param.kernelSizes );
    MosSafeDeleteArray( param.kernelCurbeOffset );

    return hr;
}

//*-----------------------------------------------------------------------------
//! Flush the queue, i.e. submit all tasks in the queue to execute according
//! to their order in the the queue. The queue will be empty after flush,
//! This is a non-blocking call. i.e. it returs immediately without waiting for
//! GPU to finish the execution of tasks.
//! INPUT:
//! OUTPUT:
//!     CM_SUCCESS if all tasks in the queue are submitted
//!     CM_FAILURE otherwise.
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::FlushTaskWithoutSync( bool flushBlocked )
{
    int32_t             hr          = CM_SUCCESS;
    CmTaskInternal*     task       = nullptr;
    uint32_t            taskType  = CM_TASK_TYPE_DEFAULT;
    uint32_t            freeSurfNum = 0;
    CmSurfaceManager*   surfaceMgr = nullptr;
    CSync*              surfaceLock = nullptr;

    m_criticalSectionHalExecute.Acquire(); // Enter HalCm Execute Protection

    while( !m_enqueuedTasks.IsEmpty() )
    {
        uint32_t flushedTaskCount = m_flushedTasks.GetCount();
        if ( flushBlocked )
        {
            while( flushedTaskCount >= m_halMaxValues->maxTasks )
            {
                // If the task count in flushed queue is no less than hw restrictiion,
                // query the staus of flushed task queue. Remove any finished tasks from the queue
                QueryFlushedTasks();
                flushedTaskCount = m_flushedTasks.GetCount();
            }
        }
        else
        {
            if( flushedTaskCount >= m_halMaxValues->maxTasks )
            {
                // If the task count in flushed queue is no less than hw restrictiion,
                // query the staus of flushed task queue. Remove any finished tasks from the queue
                QueryFlushedTasks();
                flushedTaskCount = m_flushedTasks.GetCount();
                if( flushedTaskCount >= m_halMaxValues->maxTasks )
                {
                    // If none of flushed tasks finishes, we can't flush more taks.
                    break;
                }
            }
        }

        task = (CmTaskInternal*)m_enqueuedTasks.Pop();
        CM_CHK_NULL_GOTOFINISH_CMERROR( task );

        CmNotifierGroup *notifiers = m_device->GetNotifiers();
        if (notifiers != nullptr)
        {
            notifiers->NotifyTaskFlushed(m_device, task);
        }

        task->GetTaskType(taskType);

        switch(taskType)
        {
            case CM_INTERNAL_TASK_WITH_THREADSPACE:
                hr = FlushGeneralTask(task);
                break;

            case CM_INTERNAL_TASK_WITH_THREADGROUPSPACE:
                hr = FlushGroupTask(task);
                break;

            case CM_INTERNAL_TASK_VEBOX:
                hr = FlushVeboxTask(task);
                break;

            case CM_INTERNAL_TASK_ENQUEUEWITHHINTS:
                hr = FlushEnqueueWithHintsTask(task);
                break;

            default:    // by default, assume the task is considered as general task: CM_INTERNAL_TASK_WITH_THREADSPACE
                hr = FlushGeneralTask(task);
                break;
        }

        if(hr == CM_SUCCESS)
        {
            m_flushedTasks.Push( task );
            task->VtuneSetFlushTime(); // Record Flush Time
        }
        else
        {
            // Failed to flush, destroy the task.
            CmTaskInternal::Destroy( task );
        }

    } // loop for task

    QueryFlushedTasks();

finish:
    m_criticalSectionHalExecute.Release();//Leave HalCm Execute Protection

    //Delayed destroy for resource
    m_device->GetSurfaceManager(surfaceMgr);
    if (!surfaceMgr)
    {
        CM_ASSERTMESSAGE("Error: Pointer to surface manager is null.");
        return CM_NULL_POINTER;
    }

    surfaceLock = m_device->GetSurfaceCreationLock();
    if (surfaceLock == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Pointer to surface creation lock is null.");
        return CM_NULL_POINTER;
    }
    surfaceLock->Acquire();
    surfaceMgr->RefreshDelayDestroySurfaces(freeSurfNum);
    surfaceLock->Release();

    return hr;
}

//*-----------------------------------------------------------------------------
//| Purpose:    Enqueue a Vebox Task
//| Arguments :
//|               pVebox_G75      [in]       Pointer to a CmVebox object
//|               event          [in]       Reference to the pointer to Event
//|
//| Returns:    Result of the operation.
//*-----------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueVebox(CmVebox * vebox, CmEvent* & event)
{
    INSERT_API_CALL_LOG();

    int32_t hr                  = CM_SUCCESS;
    CmTaskInternal* task   = nullptr;
    int32_t taskDriverId        = -1;
    bool isEventVisible    = (event == CM_NO_EVENT)? false:true;
    CmEventRT *eventRT = static_cast<CmEventRT *>(event);

    //Check if the input is valid
    if ( vebox == nullptr )
    {
        CM_ASSERTMESSAGE("Error: Pointer to vebox is null.");
        return CM_NULL_POINTER;
    }
    CmVeboxRT *veboxRT = static_cast<CmVeboxRT *>(vebox);
    CM_CHK_CMSTATUS_GOTOFINISH(CmTaskInternal::Create(m_device,  veboxRT, task ));

    LARGE_INTEGER nEnqueueTime;
    if ( !(MOS_QueryPerformanceCounter( (uint64_t*)&nEnqueueTime.QuadPart )) )
    {
        CM_ASSERTMESSAGE("Error: Query Performance counter failure.");
        return CM_FAILURE;
    }

    CM_CHK_CMSTATUS_GOTOFINISH(CreateEvent(task, isEventVisible, taskDriverId, eventRT));

    if ( eventRT != nullptr )
    {
        eventRT->SetEnqueueTime( nEnqueueTime );
    }
    event = eventRT;

    if (!m_enqueuedTasks.Push(task))
    {
        CM_ASSERTMESSAGE("Error: Push enqueued tasks failure.")
        hr = CM_FAILURE;
        goto finish;
    }

    CM_CHK_CMSTATUS_GOTOFINISH(FlushTaskWithoutSync());

finish:

    return hr;
}

//*-----------------------------------------------------------------------------
//| Purpose:   Create Event and Update event in m_eventArray
//| Returns:   result of operation
//*-----------------------------------------------------------------------------
int32_t CmQueueRT::CreateEvent(CmTaskInternal *task, bool isVisible, int32_t &taskDriverId, CmEventRT *&event )
{
    int32_t hr = CM_SUCCESS;

    m_criticalSectionEvent.Acquire();

    uint32_t freeSlotInEventArray = m_eventArray.GetFirstFreeIndex();

    hr = CmEventRT::Create( freeSlotInEventArray, this, task, taskDriverId, m_device, isVisible, event );

    if (hr == CM_SUCCESS)
    {

        m_eventArray.SetElement( freeSlotInEventArray, event );
        m_eventCount ++;

        task->SetTaskEvent( event );

        if(!isVisible)
        {
            event = nullptr;
        }

    }
    else
    {
        CM_ASSERTMESSAGE("Error: Create Event failure.")
    }

    m_criticalSectionEvent.Release();

    return hr;
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       EnqueueCopyCPUToGPUFullStride()
//| Purpose:    Copy data from system memory to video memory (surface)
//| Arguments:
//|             surface      [in]  Pointer to a CmSurface2D object as copy destination
//|             sysMem       [in]  Pointer to a system memory as copy source
//|             widthStride   [in]  Width stride in bytes for system memory (to calculate start of next line)
//|             heightStride  [in]  Width stride in row for system memory (to calculate start of next plane)
//|             option        [in]  Option passed from user, blocking copy, non-blocking copy or disable turbo boost
//|             event        [in,out]  Reference to the pointer to Event
//| Returns:    Result of the operation.
//|
//| Restrictions & Notes:
//|             1) sysMem must be 16-byte aligned.
//|             2) Surface's width must be 16-byte aligned regarding performance.
//|             3) widthStride and heightStride are used to indicate the padding information in system memory
//|                 widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
//|                 heightStride = height + padding_in_row
//*---------------------------------------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyCPUToGPUFullStride( CmSurface2D* surface,
                                                     const unsigned char* sysMem,
                                                     const uint32_t widthStride,
                                                     const uint32_t heightStride,
                                                     const uint32_t option,
                                                     CmEvent* & event )
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, (unsigned char*)sysMem, widthStride, heightStride, CM_FASTCOPY_CPU2GPU, option, event);
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       EnqueueCopyGPUToCPUFullStride()
//| Purpose:    Copy data from tiled video memory (surface) to linear system memory
//| Arguments:
//|             surface      [in]  Pointer to a CmSurface2D object as copy source
//|             sysMem       [in]  Pointer to a system memory as copy destination
//|             widthStride   [in]  Width stride in bytes for system memory (to calculate start of next line)
//|             heightStride  [in]  Width stride in row for system memory (to calculate start of next plane)
//|             option        [in]  Option passed from user, blocking copy,non-blocking copy or disable turbo boost
//|             event        [in,out]  Reference to the pointer to Event
//| Returns:    Result of the operation.
//|
//| Restrictions & Notes:
//|             1) sysMem must be 16-byte aligned.
//|             2) Surface's width must be 16-byte aligned regarding performance.
//|             3) widthStride and heightStride are used to indicate the padding information in system memory
//|                 widthStride = width_in_pixel * bytes_per_pixel + padding_in_bytes
//|                 heightStride = height + padding_in_row
//*---------------------------------------------------------------------------------------------------------
CM_RT_API int32_t CmQueueRT::EnqueueCopyGPUToCPUFullStride( CmSurface2D* surface,
                                                     unsigned char* sysMem,
                                                     const uint32_t widthStride,
                                                     const uint32_t heightStride,
                                                     const uint32_t option,
                                                     CmEvent* & event )
{
    INSERT_API_CALL_LOG();

    if (!m_device->HasGpuCopyKernel())
    {
        return CM_NOT_IMPLEMENTED;
    }

    CmSurface2DRT *surfaceRT = static_cast<CmSurface2DRT *>(surface);
    return EnqueueCopyInternal(surfaceRT, sysMem, widthStride, heightStride, CM_FASTCOPY_GPU2CPU, option, event);
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       CreateGPUCopyKernel()
//| Purpose:    Create GPUCopy kernel, reuse the kernel if it has been created and resuable
//| Arguments:
//|             widthInByte      [in]  surface's width in bytes
//|             height           [in]  surface's height
//|             format           [in]  surface's height
//|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
//|             gpuCopyKernelParam [out] kernel param
//|
//| Returns:    Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::CreateGPUCopyKernel(uint32_t widthInByte,
                                       uint32_t height,
                                       CM_SURFACE_FORMAT format,
                                       CM_GPUCOPY_DIRECTION copyDirection,
                                       CM_GPUCOPY_KERNEL* &gpuCopyKernelParam)
{
    int32_t     hr                 = CM_SUCCESS;

    //Search existing kernel
    CM_CHK_CMSTATUS_GOTOFINISH(SearchGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam));

    if(gpuCopyKernelParam != nullptr)
    { // reuse
        GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);
    }
    else
    {
        gpuCopyKernelParam   = new (std::nothrow) CM_GPUCOPY_KERNEL ;
        CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyKernelParam);
        CmSafeMemSet(gpuCopyKernelParam, 0, sizeof(CM_GPUCOPY_KERNEL));

        CM_CHK_CMSTATUS_GOTOFINISH(AllocateGPUCopyKernel(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernel));
        CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, gpuCopyKernelParam->kernelID));
        GPUCOPY_KERNEL_LOCK(gpuCopyKernelParam);

        CM_CHK_CMSTATUS_GOTOFINISH(AddGPUCopyKernel(gpuCopyKernelParam));
    }

finish:
    if( hr != CM_SUCCESS)
    {
        CmSafeDelete(gpuCopyKernelParam);
    }

    return hr;
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       SearchGPUCopyKernel()
//| Purpose:    Search if the required kernel exists
//| Arguments:
//|             widthInByte      [in]  surface's width in bytes
//|             height           [in]  surface's height
//|             format           [in]  surface's height
//|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
//|             gpuCopyKernelParam [out] kernel param
//|
//| Returns:    Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::SearchGPUCopyKernel(uint32_t widthInByte,
                                       uint32_t height,
                                       CM_SURFACE_FORMAT format,
                                       CM_GPUCOPY_DIRECTION copyDirection,
                                       CM_GPUCOPY_KERNEL* &kernelParam)
{
    int32_t     hr = CM_SUCCESS;
    CM_GPUCOPY_KERNEL *gpucopyKernel = nullptr;
    CM_GPUCOPY_KERNEL_ID kernelTypeID = GPU_COPY_KERNEL_UNKNOWN;

    kernelParam = nullptr;
    CM_CHK_CMSTATUS_GOTOFINISH(GetGPUCopyKrnID(widthInByte, height, format, copyDirection, kernelTypeID));

    for(uint32_t index =0 ;  index< m_copyKernelParamArrayCount; index++)
    {
        gpucopyKernel = (CM_GPUCOPY_KERNEL*)m_copyKernelParamArray.GetElement(index);
        if(gpucopyKernel != nullptr)
        {
            if(!gpucopyKernel->locked &&
               gpucopyKernel->kernelID == kernelTypeID)
            {
                kernelParam = gpucopyKernel;
                break;
            }
        }
    }

finish:
    return hr;
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       AddGPUCopyKernel()
//| Purpose:    Add new kernel into m_copyKernelParamArray
//| Arguments:
//|             widthInByte      [in]  surface's width in bytes
//|             height           [in]  surface's height
//|             format           [in]  surface's height
//|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
//|             gpuCopyKernelParam [out] kernel param
//|
//| Returns:    Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::AddGPUCopyKernel(CM_GPUCOPY_KERNEL* &kernelParam)
{
    int32_t hr = CM_SUCCESS;
    // critical section protection
    CLock locker(m_criticalSectionGPUCopyKrn);

    CM_CHK_NULL_GOTOFINISH(kernelParam, CM_INVALID_GPUCOPY_KERNEL);

    // the newly created kernel must be locked
    if(!kernelParam->locked)
    {
        CM_ASSERTMESSAGE("Error: The newly created kernel must be locked.")
        hr = CM_INVALID_GPUCOPY_KERNEL;
        goto finish;
    }

    m_copyKernelParamArray.SetElement(m_copyKernelParamArrayCount, kernelParam);
    m_copyKernelParamArrayCount ++;

finish:
    return hr;
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       GetGPUCopyKrnID()
//| Purpose:    Calculate the kernel ID accroding surface's width, height and copy direction
//| Arguments:
//|             widthInByte      [in]  surface's width in bytes
//|             height           [in]  surface's height
//|             format           [in]  surface's height
//|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
//|             kernelID         [out] kernel id
//|
//| Returns:    Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::GetGPUCopyKrnID( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
            CM_GPUCOPY_DIRECTION copyDirection, CM_GPUCOPY_KERNEL_ID &kernelID )
{
    int32_t hr = CM_SUCCESS;

    kernelID = GPU_COPY_KERNEL_UNKNOWN;

    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        switch(copyDirection)
        {
            case CM_FASTCOPY_GPU2CPU:
                if ( (height&0x7) ||(widthInByte&0x7f))
                {
                    kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_NV12_ID ;
                }
                else
                {   // height 8-row aligned, widthByte 128 multiple
                    kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_NV12_ID ;
                }
                break;

            case CM_FASTCOPY_CPU2GPU:
                kernelID = GPU_COPY_KERNEL_CPU2GPU_NV12_ID;
                break;

            case CM_FASTCOPY_GPU2GPU:
                kernelID = GPU_COPY_KERNEL_GPU2GPU_NV12_ID;
                break;

            case CM_FASTCOPY_CPU2CPU:
                kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
                break;

            default :
                CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
                hr = CM_FAILURE;
                break;
        }
    }
    else
    {
        switch(copyDirection)
        {
            case CM_FASTCOPY_GPU2CPU:
                if ( (height&0x7) ||(widthInByte&0x7f))
                {
                    kernelID = GPU_COPY_KERNEL_GPU2CPU_UNALIGNED_ID;
                }
                else
                {   // height 8-row aligned, widthByte 128 multiple
                    kernelID = GPU_COPY_KERNEL_GPU2CPU_ALIGNED_ID;
                }
                break;

            case CM_FASTCOPY_CPU2GPU:
                kernelID = GPU_COPY_KERNEL_CPU2GPU_ID;
                break;

            case CM_FASTCOPY_GPU2GPU:
                kernelID = GPU_COPY_KERNEL_GPU2GPU_ID;
                break;

            case CM_FASTCOPY_CPU2CPU:
                kernelID = GPU_COPY_KERNEL_CPU2CPU_ID;
                break;

            default :
                CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
                hr = CM_FAILURE;
                break;
        }
    }

    return hr;
}

//*---------------------------------------------------------------------------------------------------------
//| Name:       AllocateGPUCopyKernel()
//| Purpose:    Allocate GPUCopy Kernel
//| Arguments:
//|             widthInByte      [in]  surface's width in bytes
//|             height           [in]  surface's height
//|             format           [in]  surface's height
//|             copyDirection    [in]  copy direction, cpu -> gpu or gpu -> cpu
//|             kernel          [out] pointer to created kernel
//|
//| Returns:    Result of the operation.
//|
//*---------------------------------------------------------------------------------------------------------
int32_t CmQueueRT::AllocateGPUCopyKernel( uint32_t widthInByte, uint32_t height, CM_SURFACE_FORMAT format,
            CM_GPUCOPY_DIRECTION copyDirection, CmKernel *&kernel )
{
    int32_t          hr                 = CM_SUCCESS;
    CmProgram       *gpuCopyProgram    = nullptr;

    CM_CHK_CMSTATUS_GOTOFINISH( m_device->LoadPredefinedCopyKernel(gpuCopyProgram));
    CM_CHK_NULL_GOTOFINISH_CMERROR(gpuCopyProgram);

    if (format == CM_SURFACE_FORMAT_NV12 || format == CM_SURFACE_FORMAT_P010 || format == CM_SURFACE_FORMAT_P016)
    {
        switch(copyDirection)
        {
            case CM_FASTCOPY_GPU2CPU:
                if ( (height&0x7) ||(widthInByte&0x7f))
                {
                    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
                }
                else
                {   // height 8-row aligned, widthByte 128 multiple
                    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_NV12_aligned_32x32 ) , kernel,"PredefinedGPUCopyKernel"));
                }
                break;

            case CM_FASTCOPY_CPU2GPU:
                CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_NV12_32x32 ), kernel, "PredefinedGPUCopyKernel"));
                break;

            case CM_FASTCOPY_GPU2GPU:
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_NV12_32x32), kernel, "PredefinedGPUCopyKernel"));
                break;

            case CM_FASTCOPY_CPU2CPU:
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
                break;

            default :
                CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
                hr = CM_FAILURE;
                break;
        }
    }
    else
    {
        switch(copyDirection)
        {
            case CM_FASTCOPY_GPU2CPU:
                if ( (height&0x7) ||(widthInByte&0x7f))
                {
                    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_32x32 ) , kernel, "PredefinedGPUCopyKernel"));
                }
                else
                {   // height 8-row aligned, widthByte 128 multiple
                    CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_read_aligned_32x32  ) , kernel, "PredefinedGPUCopyKernel"));
                }
                break;

            case CM_FASTCOPY_CPU2GPU:
                CM_CHK_CMSTATUS_GOTOFINISH( m_device->CreateKernel( gpuCopyProgram, _NAME( surfaceCopy_write_32x32 ), kernel, "PredefinedGPUCopyKernel" ));
                break;

            case CM_FASTCOPY_GPU2GPU:
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_2DTo2D_32x32), kernel, "PredefinedGPUCopyKernel"));
                break;

            case CM_FASTCOPY_CPU2CPU:
                CM_CHK_CMSTATUS_GOTOFINISH(m_device->CreateKernel(gpuCopyProgram, _NAME(SurfaceCopy_BufferToBuffer_4k), kernel, "PredefinedGPUCopyKernel"));
                break;

            default :
                CM_ASSERTMESSAGE("Error: Invalid fast copy direction.")
                hr = CM_FAILURE;
                break;
        }
    }

finish:
    return hr;
}

CM_RT_API int32_t CmQueueRT::EnqueueFast(CmTask *task,
                              CmEvent *&event,
                              const CmThreadSpace *threadSpace)
{
    CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (state == nullptr || state->advExecutor == nullptr)
    {
        return CM_NULL_POINTER;
    }
    else
    {
        const CmThreadSpaceRT *threadSpaceRTConst = static_cast<const CmThreadSpaceRT *>(threadSpace);
        if (state->cmHalInterface->CheckMediaModeAvailability() == false) 
        {
            if (threadSpaceRTConst != nullptr) 
            {
                return state->advExecutor->SubmitComputeTask(this, task, event, threadSpaceRTConst->GetThreadGroupSpace(), (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
            }
        }
        return state->advExecutor->SubmitTask(this, task, event, threadSpace, (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
    }
}

CM_RT_API int32_t CmQueueRT::DestroyEventFast(CmEvent *&event)
{
    CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    if (state == nullptr || state->advExecutor == nullptr)
    {
        return CM_NULL_POINTER;
    }
    else
    {
        return state->advExecutor->DestoryEvent(this, event);
    }
}

CM_RT_API int32_t CmQueueRT::EnqueueWithGroupFast(CmTask *task,
                                      CmEvent *&event,
                                      const CmThreadGroupSpace *threadGroupSpace)
{
    CM_HAL_STATE * state = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (state == nullptr || state->advExecutor == nullptr)
    {
        return CM_NULL_POINTER;
    }

    return state->advExecutor->SubmitComputeTask(this, task, event, threadGroupSpace, (MOS_GPU_CONTEXT)m_queueOption.GPUContext);
}

}