PHengLEI-NCCR/API/DataStruct/include/PHOps.hxx

//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//                                                                             +
//       Hybrid Platform for Engineering and Research of Flows (HyperFlow)     +
//                        面向流体工程与研究的混合CFD平台                      +
//                        （C）Zhang Laiping and He Xin                        +
//                    State Key Laboratory of Aerodynamics                     +
//                    Computational Aerodynamics Institute                     +
//            China Aerodynamics Research and Development Center               +
//                                 Since 2007                                  +
//                                                                             +
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// 文件头部说明：                                                              +
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// FILE:       XXX.cpp (XXX.h)                                                 +
// AUTHOR(S):  He Xin                                                          +
// PURPOSE:    simulation object                                               +
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// 函数头部说明：                                                              +
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Function:   例如：Simulation::Simulation()                                  +
// AUTHOR(S):  He Xin                                                          +
// PURPOSE:    具体说明本模块的功能                                            +
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

namespace PHSPACE
{
//! Operator() functions:
template<typename T, int N>
const T & PHArray<T, N>::operator()(int i0) const
{
    //assertInRange(i0);
#ifdef USE_SMARTARRAY
    return datap1[i0];
#else
    return data_[i0 * stride_[0]];
#endif
}

template<typename T, int N>
T & PHArray<T, N>::operator()(int i0)
{
    //assertInRange(i0);
#ifdef USE_SMARTARRAY
    return datap1[i0];
#else
    return data_[i0 * stride_[0]];
#endif
}

template<typename T, int N>
const T & PHArray<T, N>::operator()(int i0, int i1) const
{
    //assertInRange(i0, i1);
#ifdef USE_SMARTARRAY
    return datap2[i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]];
#endif
}

template<typename T, int N>
T & PHArray<T, N>::operator()(int i0, int i1)
{
    //assertInRange(i0, i1);
#ifdef USE_SMARTARRAY
    return datap2[i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]];
#endif
}

template<typename T, int N>
const T & PHArray<T, N>::operator()(int i0, int i1, int i2) const
{
    //assertInRange(i0, i1, i2);
#ifdef USE_SMARTARRAY
    return datap3[i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2]];
#endif
}

template<typename T, int N>
T & PHArray<T, N>::operator()(int i0, int i1, int i2)
{
    //assertInRange(i0, i1, i2);
#ifdef USE_SMARTARRAY
    return datap3[i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2]];
#endif
}

template<typename T, int N>
const T & PHArray<T, N>::operator()(int i0, int i1, int i2, int i3) const
{
    //assertInRange(i0, i1, i2);
#ifdef USE_SMARTARRAY
    return datap4[i3][i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2] + i3 * stride_[3]];
#endif
}

template<typename T, int N>
T & PHArray<T, N>::operator()(int i0, int i1, int i2, int i3)
{
    //assertInRange(i0, i1, i2);
#ifdef USE_SMARTARRAY
    return datap4[i3][i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2] + i3 * stride_[3]];
#endif
}

template<typename T, int N>
const T & PHArray<T, N>::operator()(int i0, int i1, int i2, int i3, int i4) const
{
#ifdef USE_SMARTARRAY
    return datap5[i4][i3][i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2] + i3 * stride_[3] + i4 * stride_[4]];
#endif
}

template<typename T, int N>
T & PHArray<T, N>::operator()(int i0, int i1, int i2, int i3, int i4)
{
#ifdef USE_SMARTARRAY
    return datap5[i4][i3][i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2] + i3 * stride_[3] + i4 * stride_[4]];
#endif
}

//!ugks
template<typename T, int N>
const T & PHArray<T, N>::operator()(int i0, int i1, int i2, int i3, int i4, int i5) const
{
#ifdef USE_SMARTARRAY
    return datap6[i5][i4][i3][i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2] + i3 * stride_[3] + i4 * stride_[4] + i5 * stride_[5]];
#endif
}
//!ugks
template<typename T, int N>
T & PHArray<T, N>::operator()(int i0, int i1, int i2, int i3, int i4, int i5)
{
#ifdef USE_SMARTARRAY
    return datap6[i5][i4][i3][i2][i1][i0];
#else
    return data_[i0 * stride_[0] + i1 * stride_[1]
    + i2 * stride_[2] + i3 * stride_[3] + i4 * stride_[4]+i5 * stride_[5]];
#endif
}

template<typename T, int N>
const T & PHArray<T, N>::operator[](int i) const
{
    return data_[i];
}

template<typename T, int N>
T & PHArray<T, N>::operator[](int i)
{
    return data_[i];
}

template<typename T, int N>
template<int N_rank2>
const T & PHArray<T, N>::operator()(const SimpleArray<int, N_rank2> &index) const
{
    return data_[dot(index, stride_)];
}

template<typename T, int N>
template<int N_rank2>
T & PHArray<T, N>::operator()(const SimpleArray<int, N_rank2> &index)
{
    return data_[dot(index, stride_)];
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0) const
{
    return PHArray<T, N>(noConst(), r0);
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0, Range r1) const
{
    return PHArray<T, N>(noConst(), r0, r1);
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0, Range r1, Range r2) const
{
    return T_array(noConst(), r0, r1, r2);
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0, Range r1, Range r2, Range r3) const
{
    return PHArray<T, N>(noConst(), r0, r1, r2, r3);
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0, Range r1, Range r2, Range r3, Range r4) const
{
    return PHArray<T, N>(noConst(), r0, r1, r2, r3, r4);
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0, Range r1, Range r2, Range r3, Range r4, Range r5) const
{
    return PHArray<T, N>(noConst(), r0, r1, r2, r3, r4, r5);
}

template<typename T, int N>
PHArray<T, N> PHArray<T, N>::operator()(Range r0, Range r1, Range r2, Range r3, Range r4, Range r5, Range r6) const
{
    return PHArray<T, N>(noConst(), r0, r1, r2, r3, r4, r5, r6);
}

template<typename T, int N>
template<typename T1, typename T2>
typename PHSliceInfo<T, T1, T2>::T_slice
    PHArray<T, N>::operator()(T1 r1, T2 r2) const
{
    typedef typename PHSliceInfo<T, T1, T2>::T_slice slice;
    return slice(noConst(), r1, r2, PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection(), PHnilArraySection());
}

template<typename T, int N>
template<typename T1, typename T2, typename T3>
typename PHSliceInfo<T, T1, T2, T3>::T_slice
    PHArray<T, N>::operator()(T1 r1, T2 r2, T3 r3) const
{
    typedef typename PHSliceInfo<T, T1, T2, T3>::T_slice slice;
    return slice(noConst(), r1, r2, r3, PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection());
}

template<typename T, int N>
template<typename T1, typename T2, typename T3, typename T4>
typename PHSliceInfo<T, T1, T2, T3, T4>::T_slice
    PHArray<T, N>::operator()(T1 r1, T2 r2, T3 r3, T4 r4) const
{
    typedef typename PHSliceInfo<T, T1, T2, T3, T4>::T_slice slice;
    return slice(noConst(), r1, r2, r3, r4, PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection());
}

template<typename T, int N>
template<typename T1, typename T2, typename T3, typename T4, typename T5>
typename PHSliceInfo<T, T1, T2, T3, T4, T5>::T_slice
    PHArray<T, N>::operator()(T1 r1, T2 r2, T3 r3, T4 r4, T5 r5) const
{
    typedef typename PHSliceInfo<T, T1, T2, T3, T4, T5>::T_slice slice;
    return slice(noConst(), r1, r2, r3, r4, r5, PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection());
}

template<typename T, int N>
template<typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
typename PHSliceInfo<T, T1, T2, T3, T4, T5, T6>::T_slice
    PHArray<T, N>::operator()(T1 r1, T2 r2, T3 r3, T4 r4, T5 r5, T6 r6) const
{
    typedef typename PHSliceInfo<T, T1, T2, T3, T4, T5, T6>::T_slice slice;
    return slice(noConst(), r1, r2, r3, r4, r5, r6, PHnilArraySection(), PHnilArraySection(), PHnilArraySection(),
                 PHnilArraySection(), PHnilArraySection());
}
#define PH_ARRAY_UPDATE(op, name)                                    \
template < typename P_numtype, int N_rank >                          \
template < typename T >                                              \
inline PHArray<P_numtype, N_rank>&                                   \
PHArray<P_numtype, N_rank>::operator op(const T &expr)               \
{                                                                    \
    evaluate(typename PHExprWrap<T>::T_expr(expr),                   \
      name<T_numtype, typename PHExprWrap<T>::T_expr::T_numtype>()); \
    return *this;                                                    \
}

PH_ARRAY_UPDATE(+=, fy_plus_update)
PH_ARRAY_UPDATE(-=, fy_minus_update)
PH_ARRAY_UPDATE(*=, fy_multiply_update)
PH_ARRAY_UPDATE(/=, fy_divide_update)

template < typename T_numtype, int N_rank > template < typename T_expr, typename T_update >
PHArray<T_numtype, N_rank>&
PHArray<T_numtype, N_rank>::evaluateWithIndexTraversal1(T_expr expr, T_update)
{
    SimpleArray<int, N_rank> index;

    if (stride(firstRank) == 1)
    {
        int first = lbound(firstRank);
        int last  = ubound(firstRank);

        T_numtype *iter = data_ + first;

        for (index[0] = first; index[0] <= last; ++ index[0])
        {
            T_update::update(*iter ++, expr(index));
            int kk = 1;
            int i = *iter;
            //T_update::update(*iter++,3);
            //PHUpdate<int,int>::update(kk,2);
            int kkk = 1;
        }
    }
    else
    {
        PHFastArrayIterator<T_numtype, N_rank> iter(*this);
        iter.loadStride(0);
        int first = lbound(firstRank);
        int last  = ubound(firstRank);

        for (index[0] = first; index[0] <= last; ++ index[0])
        {
            T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
            iter.advance();
        }
    }

    return *this;
}

template < typename T_numtype, int N_rank > template < typename T_expr, typename T_update >
PHArray<T_numtype, N_rank>&
PHArray<T_numtype, N_rank>::evaluateWithIndexTraversalN(T_expr expr, T_update)
{
    // Do a stack-type traversal for the destination array and use
    // index traversal for the source expression

    const int maxRank = ordering(0);

    PHFastArrayIterator<T_numtype, N_rank> iter(*this);
    for (int i = 1; i < N_rank; ++ i)
    {
        iter.push(ordering(i));
    }

    iter.loadStride(maxRank);

    SimpleArray<int, N_rank> index, last;

    index = storage_.base();

    for (int i = 0; i < N_rank; ++ i)
    {
        last[i] = storage_.base(i) + length_[i];
    }

    // int lastLength = length(maxRank);

    while (true)
    {
        int a = base(maxRank);
        int b = last[maxRank];
        for (index[maxRank] = base(maxRank); index[maxRank] < last[maxRank]; ++ index[maxRank])
        {
            T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
            iter.advance();
        }

        int j = 1;
        for (; j < N_rank; ++ j)
        {
            iter.pop(ordering(j));
            iter.loadStride(ordering(j));
            iter.advance();

            index[ordering(j-1)] = base(ordering(j-1));
            ++ index[ordering(j)];
            if (index[ordering(j)] != last[ordering(j)])
            {
                break;
            }
        }

        if (j == N_rank)
        {
            break;
        }

        for (; j > 0; -- j)
        {
            iter.push(ordering(j));
        }

        iter.loadStride(maxRank);
    }

    return *this;
}

template < typename T_numtype, int N_rank > template < typename T_expr, typename T_update >
PHArray<T_numtype, N_rank>&
PHArray<T_numtype, N_rank>::evaluateWithStackTraversal1(T_expr expr, T_update)
{
    PHFastArrayIterator<T_numtype, N_rank> iter(*this);
    iter.loadStride(firstRank);
    expr.loadStride(firstRank);

    const T_numtype *last = iter.data() + length(firstRank) * stride(firstRank);

    while (iter.data() != last)
    {
        T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
        iter.advance();
        expr.advance();
    }

    return *this;
}

template < typename T_numtype, int N_rank > template < typename T_expr, typename T_update >
PHArray<T_numtype, N_rank>&
PHArray<T_numtype, N_rank>::evaluateWithStackTraversalN(T_expr expr, T_update)
{
    /*
     * A stack traversal replaces the usual nested loops:
     *
     * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
     *   for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
     *     for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
     *       A(i,j,k) = 0;
     *
     * with a stack data structure.  The stack allows this single
     * routine to replace any number of nested loops.
     *
     * For each dimension (loop), these quantities are needed:
     * - a pointer to the first element encountered in the loop
     * - the stride associated with the dimension/loop
     * - a pointer to the last element encountered in the loop
     *
     * The basic idea is that entering each loop is a "push" onto the
     * stack, and exiting each loop is a "pop".  In practice, this
     * routine treats accesses the stack in a random-access way,
     * which confuses the picture a bit.  But conceptually, that's
     * what is going on.
     */

    /*
     * ordering(0) gives the dimension associated with the smallest
     * stride (usually; the exceptions have to do with subarrays and
     * are uninteresting).  We call this dimension maxRank; it will
     * become the innermost "loop".
     *
     * Ordering the loops from ordering(N_rank-1) down to
     * ordering(0) ensures that the largest stride is associated
     * with the outermost loop, and the smallest stride with the
     * innermost.  This is critical for good performance on
     * cached machines.
     */

    const int maxRank = ordering(0);
    // const int secondLastRank = ordering(1);

    // Create an iterator for the array receiving the result
    PHFastArrayIterator<T_numtype, N_rank> iter(*this);

    // Set the initial stack configuration by pushing the pointer
    // to the first element of the array onto the stack N times.

    int i;
    for (i = 1; i < N_rank; ++ i)
    {
        iter.push(i);
        expr.push(i);
    }

    // Load the strides associated with the innermost loop.
    iter.loadStride(maxRank);
    expr.loadStride(maxRank);

    /*
     * Is the stride in the innermost loop equal to 1?  If so,
     * we might take advantage of this and generate more
     * efficient code.
     */
    bool useUnitStride = iter.isUnitStride(maxRank) && expr.isUnitStride(maxRank);

    /*
     * Do all array operands share a common stride in the innermost
     * loop?  If so, we can generate more efficient code (but only
     * if this optimization has been enabled).
     */
#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
    int commonStride = expr.suggestStride(maxRank);
    if (iter.suggestStride(maxRank) > commonStride)
    {
        commonStride = iter.suggestStride(maxRank);
    }
    bool useCommonStride = iter.isStride(maxRank, commonStride) && expr.isStride(maxRank, commonStride);
#else
    int commonStride = 1;
    bool useCommonStride = false;
#endif

    /*
     * The "last" array contains a pointer to the last element
     * encountered in each "loop".
     */
    const T_numtype *last[N_rank];

    // Set up the initial state of the "last" array
    for (i = 1; i < N_rank; ++ i)
    {
        last[i] = iter.data() + length(ordering(i)) * stride(ordering(i));
    }

    int lastLength = length(maxRank);
    int firstNoncollapsedLoop = 1;

    /*
     * This bit of code handles collapsing loops.  When possible,
     * the N nested loops are converted into a single loop (basically,
     * the N-dimensional array is treated as a long vector).
     * This is important for cases where the length of the innermost
     * loop is very small, for example a 100x100x3 array.
     * If this code can't collapse all the loops into a single loop,
     * it will collapse as many loops as possible starting from the
     * innermost and working out.
     */

    // Collapse loops when possible
    for (i = 1; i < N_rank; ++ i)
    {
        // Figure out which pair of loops we are considering combining.
        int outerLoopRank = ordering(i);
        int innerLoopRank = ordering(i-1);

        /*
         * The canCollapse() routines look at the strides and extents
         * of the loops, and determine if they can be combined into
         * one loop.
         */

        if (canCollapse(outerLoopRank, innerLoopRank) && expr.canCollapse(outerLoopRank, innerLoopRank))
        {
            lastLength *= length(outerLoopRank);
            firstNoncollapsedLoop = i+1;
        }
        else
        {
            break;
        }
    }

    /*
     * Now we actually perform the loops.  This while loop contains
     * two parts: first, the innermost loop is performed.  Then we
     * exit the loop, and pop our way down the stack until we find
     * a loop that isn't completed.  We then restart the inner loops
     * and push them onto the stack.
     */

    while (true)
    {

        /*
         * This bit of code handles the innermost loop.  It would look
         * a lot simpler if it weren't for unit stride and common stride
         * optimizations; these clutter up the code with multiple versions.
         */

        if ((useUnitStride) || (useCommonStride))
        {
            /*
             * The check for BZ_USE_FAST_READ_ARRAY_EXPR can probably
             * be taken out.  This was put in place while the unit stride/
             * common stride optimizations were being implemented and
             * tested.
             */

            // Compute the end of the innermost loop
            int ubound = lastLength * commonStride;

            /*
             * This is a real kludge.  I didn't want to have to write
             * a const and non-const version of FastArrayIterator, so I use a
             * const iterator and cast away const.  This could
             * probably be avoided with some trick, but the whole routine
             * is ugly, so why bother.
             */

            T_numtype *data = const_cast<T_numtype*>(iter.data());

            /*
             * BZ_NEEDS_WORK-- need to implement optional unrolling.
             */
            if (commonStride == 1)
            {
                for (i = 0; i < ubound; ++ i)
                {
                    T_update::update(*data++, expr.fastRead(i));
                }
            }
            else
            {
                for (i = 0; i != ubound; i += commonStride)
                {
                    T_update::update(data[i], expr.fastRead(i));
                }
            }
            /*
             * Tidy up for the fact that we haven't actually been
             * incrementing the iterators in the innermost loop, by
             * faking it afterward.
             */
            iter.advance(lastLength *commonStride);
            expr.advance(lastLength *commonStride);

        }
        else
        {
            /*
             * We don't have a unit stride or common stride in the innermost
             * loop.  This is going to hurt performance.  Luckily 95% of
             * the time, we hit the cases above.
             */
            T_numtype *end = const_cast<T_numtype*>(iter.data()) + lastLength * stride(maxRank);


            while (iter.data() != end)
            {
                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
                iter.advance();
                expr.advance();
            }
        }

        /*
         * We just finished the innermost loop.  Now we pop our way down
         * the stack, until we hit a loop that hasn't completed yet.
         */
        int j = firstNoncollapsedLoop;
        for (; j < N_rank; ++ j)
        {
            // Get the next loop
            int r = ordering(j);

            // Pop-- this restores the data pointers to the first element
            // encountered in the loop.
            iter.pop(j);
            expr.pop(j);

            // Load the stride associated with this loop, and increment
            // once.
            iter.loadStride(r);
            expr.loadStride(r);
            iter.advance();
            expr.advance();

            // If we aren't at the end of this loop, then stop popping.
            if (iter.data() != last[j])
            {
                break;
            }
        }

        // Are we completely done?
        if (j == N_rank)
        {
            break;
        }

        // No, so push all the inner loops back onto the stack.
        for (; j >= firstNoncollapsedLoop; --j)
        {
            int r2 = ordering(j-1);
            iter.push(j);
            expr.push(j);
            last[j-1] = iter.data() + length(r2) * stride(r2);
        }

        // Load the stride for the innermost loop again.
        iter.loadStride(maxRank);
        expr.loadStride(maxRank);
    }

    return *this;
}

template < typename T_numtype, int N_rank > template < typename T_expr, typename T_update >
PHArray<T_numtype, N_rank>&
PHArray<T_numtype, N_rank>::evaluate(T_expr expr, T_update)
{
    // Check that all arrays have the same shape
    /*
     * Check that the arrays are not empty (e.g. length 0 arrays)
     * This fixes a bug found by Peter Bienstman, 6/16/99, where
     * Array<RDouble,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i);
     * went into an infinite loop.
     */
    int N_rankTmp = N_rank;

    if (numElements() == 0)
    {
        return *this;
    }

    if (N_rankTmp == 1)
    {
        return evaluateWithStackTraversal1(expr, T_update());
    }
    else
    {
        return evaluateWithStackTraversalN(expr, T_update());
    }
}

template < typename T, int N >
PHArray<T, N> & PHArray<T, N>::operator=(T x)
{
    return initialize(x);
}

template < typename T, int N > template < typename T_expr >
PHArray<T, N>&
PHArray<T, N>::operator = (const PHETBase<T_expr> &expr)
{
    evaluate(expr.unwrap(), PHUpdate<T_numtype, typename T_expr::T_numtype>());

    return *this;
}

template < typename T, int N >
PHArray<T, N>&
PHArray<T, N>::operator = (const PHArray<T, N> &x)
{
    (*this) = PHArrayExpr< PHFastArrayIterator<T, N> >(x.beginFast());
    return *this;
}

template < typename T, int N >
PHArray<T, N>&
PHArray<T, N>::initialize(T x)
{
    (*this) = PHArrayExpr< PHArrayExprConstant<T > >(x);
    return *this;
}

}