blob: be9b5c6e8ddb1c0376bc8f4fb7f36bdc1dca3cce [file] [log] [blame]
/*
* SDL - Simple DirectMedia Layer
* CELL BE Support for PS3 Framebuffer
* Copyright (C) 2008, 2009 International Business Machines Corporation
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*
* Martin Lowinski <lowinski [at] de [dot] ibm [dot] com>
* Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
* SPE code based on research by:
* Rene Becker
* Thimo Emmerich
*/
#include "spu_common.h"
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
// Debugging: uncomment the define below to turn deprintf() tracing on.
//#define DEBUG
#ifdef DEBUG
/*
 * deprintf(fmt, ...) - printf-style trace output to stdout, flushed after
 * every message.
 *
 * The two calls are wrapped in do { } while (0) so that the macro expands
 * to exactly one statement; without the wrapper an unbraced
 * "if (x) deprintf(...); else ..." would not compile and an unbraced "if"
 * would guard only the fprintf().  Call it with a trailing semicolon.
 */
#define deprintf(fmt, args... ) \
    do { \
        fprintf( stdout, fmt, ##args ); \
        fflush( stdout ); \
    } while (0)
#else
/* Tracing disabled: deprintf() expands to nothing. */
#define deprintf( fmt, args... )
#endif
/* Scaling job parameters; main() fills this by DMA from the address in argp. */
struct scale_parms_t parms __attribute__((aligned(128)));
/* Source line buffers in local store; the first index (0/1) selects the
 * double-buffer slot.
 * A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored.
 * V and U lines may have to be fetched from a misaligned address; the
 * extra 128 bytes in the buffer sizes leave room for that.
 */
unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
/* Temp buffers receiving the scaled lines before they are written back to
 * main memory (first index again selects the double-buffer slot):
 * 4 lines Y, therefore 2 lines V, 2 lines U in total.
 */
unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
/* Clamp bounds used by vfloat_to_vuint() for the float to int conversion:
 * upper bound 255.0 and lower bound 0.1 in every lane.
 */
static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
/*
 * Line scalers (defined elsewhere in this file).  As used by the callers:
 *   src            - first of the two fetched source lines in local store
 *   dst_           - local-store buffer receiving one scaled line
 *   dst_width      - number of output pixels to produce
 *   vf_x_scale     - source width / destination width, splatted to all lanes
 *   vf_NSweight    - vertical (north-south) blend weight, splatted to all lanes
 *   src_linestride - distance in bytes between the two source lines
 * NOTE(review): the _w8/_w16 suffix presumably names the pixel group size
 * handled per step - confirm against the definitions.
 */
void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
/*
 * Scaler variants; main() picks one depending on whether the source and
 * destination pixel widths are multiples of 32.  They take no arguments
 * (all input comes from the global parms), which is spelled out with
 * (void) so the compiler checks calls against a real prototype.
 */
void scale_srcw16_dstw16(void);
void scale_srcw16_dstw32(void);
void scale_srcw32_dstw16(void);
void scale_srcw32_dstw32(void);
/*
 * SPU program entry point.
 *
 * Fetches the scaling parameters from the main-memory address given in
 * argp into the global parms, then runs the scaler variant matching the
 * source and destination widths (multiple of 32 or not).
 *
 * @param spe_id  only used for the debug trace
 * @param argp    main-memory address of the struct scale_parms_t to load
 * @returns always 0
 */
int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
{
    deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);

    /* load the input parameters by DMA and wait until they have arrived */
    spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
    DMA_WAIT_TAG(TAG_INIT);

    deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
            parms.dst_pixel_width, parms.dst_pixel_height);

    /* a width with none of its low five bits set is a multiple of 32 */
    const int src_is_w32 = (parms.src_pixel_width & 0x1f) == 0;
    const int dst_is_w32 = (parms.dst_pixel_width & 0x1F) == 0;

    if (src_is_w32 && dst_is_w32) {
        deprintf("[SPU] Using scale_srcw32_dstw32\n");
        scale_srcw32_dstw32();
    } else if (src_is_w32) {
        deprintf("[SPU] Using scale_srcw32_dstw16\n");
        scale_srcw32_dstw16();
    } else if (dst_is_w32) {
        deprintf("[SPU] Using scale_srcw16_dstw32\n");
        scale_srcw16_dstw32();
    } else {
        deprintf("[SPU] Using scale_srcw16_dstw16\n");
        scale_srcw16_dstw16();
    }

    deprintf("[SPU] bilin_scaler_spu... done!\n");
    return 0;
}
/*
 * vfloat_to_vuint()
 *
 * Converts a float vector to an unsigned int vector with saturation:
 * every lane is first limited to the range given by vec_0_1 (lower bound)
 * and vec_255 (upper bound) and only then converted.
 *
 * @param vec_s float vector for conversion
 * @returns converted unsigned int vector
 */
static inline vector unsigned int vfloat_to_vuint(vector float vec_s) {
    /* lanes smaller than the lower bound are replaced by the lower bound */
    const vector unsigned int below_min = spu_cmpgt(vec_0_1, vec_s);
    const vector float floored = spu_sel(vec_s, vec_0_1, below_min);

    /* lanes larger than the upper bound are replaced by the upper bound */
    const vector unsigned int above_max = spu_cmpgt(floored, vec_255);
    const vector float clamped = spu_sel(floored, vec_255, above_max);

    return spu_convtu(clamped, 0);
}
/*
 * scale_srcw16_dstw16()
 *
 * Scaler variant that main() selects when neither the source nor the
 * destination pixel width is a multiple of 32.
 *
 * Reads the planar Y, V and U source planes named in the global parms from
 * main memory, scales them bilinearly to dst_pixel_width x dst_pixel_height
 * and stores the result in YUV format to parms.dstBuffer: the Y plane first,
 * the V plane dst_width*dst_height bytes later, and the U plane a quarter
 * of that picture size after the V plane.
 *
 * Every pass produces two Y lines plus one V and one U line.  Source lines
 * are double buffered: the DMA for the next pass is started before the
 * current pass is scaled.
 *
 * Takes no arguments and returns nothing.  Returns without doing any work
 * if the destination has fewer than two lines.
 *
 * NOTE(review): the V/U stores are dst_width/2 bytes long and start at
 * multiples of dst_width/2; for widths that are not a multiple of 32 this
 * is not a multiple of 16 - confirm the MFC accepts these transfers.
 */
void scale_srcw16_dstw16(void) {
    // extract parameters
    unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
    unsigned int src_width = parms.src_pixel_width;
    unsigned int src_height = parms.src_pixel_height;
    unsigned int dst_width = parms.dst_pixel_width;
    unsigned int dst_height = parms.dst_pixel_height;

    // Every pass writes two Y lines, so there is nothing to produce for
    // fewer than two destination lines.  Leaving here also keeps the
    // unsigned loop bound (dst_height>>1)-1 from wrapping around and
    // y_scale from being divided by zero.
    if( dst_height < 2 ) {
        return;
    }

    // YVU source strides in bytes (V and U lines are half as wide as Y lines)
    unsigned int src_linestride_y = src_width;
    unsigned int src_dbl_linestride_y = src_width<<1;
    unsigned int src_linestride_vu = src_width>>1;
    unsigned int src_dbl_linestride_vu = src_width;

    // scaled YVU
    unsigned int scaled_src_linestride_y = dst_width;

    // ram addresses
    unsigned char* src_addr_y = parms.y_plane;
    unsigned char* src_addr_v = parms.v_plane;
    unsigned char* src_addr_u = parms.u_plane;

    // for handling misalignment, addresses are precalculated
    unsigned char* precalc_src_addr_v = src_addr_v;
    unsigned char* precalc_src_addr_u = src_addr_u;

    unsigned int dst_picture_size = dst_width*dst_height;

    // Sizes for destination: one store covers two Y lines or one V/U line
    unsigned int dst_dbl_linestride_y = dst_width<<1;
    unsigned int dst_dbl_linestride_vu = dst_width>>1;

    // Perform address calculation for Y, V and U in main memory with dst_addr as base
    unsigned char* dst_addr_main_memory_y = dst_addr;
    unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
    unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

    // calculate scale factors
    vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
    float y_scale = (float)src_height/(float)dst_height;

    // double buffered processing
    // buffer switching
    unsigned int curr_src_idx = 0;
    unsigned int curr_dst_idx = 0;
    unsigned int next_src_idx, next_dst_idx;

    // source line numbers (integer part of the source position)
    unsigned int next_interpl_y_upper;
    unsigned int curr_interpl_y_lower, next_interpl_y_lower;
    unsigned int next_interpl_vu;

    // weights (fractional part of the source position); the first upper
    // Y line and the first V/U line sit exactly on source line 0
    vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
    vector float vf_next_NSweight_y_upper;
    vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
    vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
    vector float vf_next_NSweight_vu;

    // source positions
    float next_src_y_upper;
    float curr_src_y_lower, next_src_y_lower;
    float next_src_vu;

    // line indices for the dst picture
    unsigned int dst_y=0, dst_vu=0;

    // offset for the v and u plane to handle misalignment
    unsigned int curr_lsoff_v = 0, next_lsoff_v;
    unsigned int curr_lsoff_u = 0, next_lsoff_u;

    // source position, line number and weight for destination line 1
    curr_src_y_lower = 1.0f*y_scale;
    curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
    vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );

    // start partially double buffered processing
    // get initial data, 2 sets of y, 1 set v, 1 set u
    mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
    mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
            (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
            src_dbl_linestride_y,
            RETR_BUF,
            0, 0 );
    mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

    /* iteration loop
     * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
     * the scaled output is 2 lines y, 1 line v, 1 line u
     * the scaled YUV output is stored to RAM
     * the last pass is handled after the loop because it fetches nothing
     */
    for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
        dst_y = dst_vu<<1;

        // calculate next indices
        next_src_vu = ((float)dst_vu+1)*y_scale;
        next_src_y_upper = ((float)dst_y+2)*y_scale;
        next_src_y_lower = ((float)dst_y+3)*y_scale;
        next_interpl_vu = (unsigned int) next_src_vu;
        next_interpl_y_upper = (unsigned int) next_src_y_upper;
        next_interpl_y_lower = (unsigned int) next_src_y_lower;

        // calculate weight NORTH-SOUTH
        vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
        vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
        vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

        // get next lines
        next_src_idx = curr_src_idx^1;
        next_dst_idx = curr_dst_idx^1;

        // 4 lines y
        mfc_get( y_plane[next_src_idx],
                (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
                src_dbl_linestride_y,
                RETR_BUF+next_src_idx,
                0, 0 );
        mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
                (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
                src_dbl_linestride_y,
                RETR_BUF+next_src_idx,
                0, 0 );

        // 2 lines v: fetch from the preceding 16-byte boundary and remember
        // how far into the buffer the wanted data starts
        precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
        next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
        mfc_get( v_plane[next_src_idx],
                ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
                src_dbl_linestride_vu+(next_lsoff_v<<1),
                RETR_BUF+next_src_idx,
                0, 0 );

        // 2 lines u: same scheme, sized by the U plane's own offset
        precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
        next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
        mfc_get( u_plane[next_src_idx],
                ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
                src_dbl_linestride_vu+(next_lsoff_u<<1),
                RETR_BUF+next_src_idx,
                0, 0 );

        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

        // the scaled buffers of this slot may still be the source of the
        // store issued two passes ago; wait for it before overwriting them
        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

        // scaling
        // work line y_upper
        bilinear_scale_line_w16( y_plane[curr_src_idx],
                scaled_y_plane[curr_src_idx],
                dst_width,
                vf_x_scale,
                vf_curr_NSweight_y_upper,
                src_linestride_y );
        // work line y_lower
        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
                scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
                dst_width,
                vf_x_scale,
                vf_curr_NSweight_y_lower,
                src_linestride_y );
        // work line v
        bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
                scaled_v_plane[curr_src_idx],
                dst_width>>1,
                vf_x_scale,
                vf_curr_NSweight_vu,
                src_linestride_vu );
        // work line u
        bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
                scaled_u_plane[curr_src_idx],
                dst_width>>1,
                vf_x_scale,
                vf_curr_NSweight_vu,
                src_linestride_vu );

        // Store the result back to main memory into a destination buffer in YUV format
        //---------------------------------------------------------------------------------------------
        // Perform three DMA transfers to 3 different locations in the main memory!
        // dst_vu counts V/U lines, i.e. pairs of Y lines, already written
        mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
                (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
                dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
                STR_BUF+curr_dst_idx, // Tag
                0, 0 );
        mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
                (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
                dst_dbl_linestride_vu, // One V line (half the destination width)
                STR_BUF+curr_dst_idx, // Tag
                0, 0 );
        mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
                (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
                dst_dbl_linestride_vu, // One U line (half the destination width)
                STR_BUF+curr_dst_idx, // Tag
                0, 0 );
        //---------------------------------------------------------------------------------------------

        // update for next cycle
        curr_src_idx = next_src_idx;
        curr_dst_idx = next_dst_idx;
        vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
        vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
        vf_curr_NSweight_vu = vf_next_NSweight_vu;
        curr_lsoff_v = next_lsoff_v;
        curr_lsoff_u = next_lsoff_u;
    }

    // last pass: the lines were fetched by the final loop iteration (or by
    // the initial transfers if the loop did not run), nothing more to fetch
    DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

    // make sure the earlier store from this slot's scaled buffers is done
    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

    // scaling
    // work line y_upper
    bilinear_scale_line_w16( y_plane[curr_src_idx],
            scaled_y_plane[curr_src_idx],
            dst_width,
            vf_x_scale,
            vf_curr_NSweight_y_upper,
            src_linestride_y );
    // work line y_lower
    bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
            scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
            dst_width,
            vf_x_scale,
            vf_curr_NSweight_y_lower,
            src_linestride_y );
    // work line v
    bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
            scaled_v_plane[curr_src_idx],
            dst_width>>1,
            vf_x_scale,
            vf_curr_NSweight_vu,
            src_linestride_vu );
    // work line u
    bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
            scaled_u_plane[curr_src_idx],
            dst_width>>1,
            vf_x_scale,
            vf_curr_NSweight_vu,
            src_linestride_vu );

    // Store the result back to main memory into a destination buffer in YUV format
    //---------------------------------------------------------------------------------------------
    // Perform three DMA transfers to 3 different locations in the main memory!
    // dst_vu now equals (dst_height>>1)-1, the last V/U line
    mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
            (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
            dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
            STR_BUF+curr_dst_idx, // Tag
            0, 0 );
    mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
            (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
            dst_dbl_linestride_vu, // One V line (half the destination width)
            STR_BUF+curr_dst_idx, // Tag
            0, 0 );
    mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
            (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
            dst_dbl_linestride_vu, // One U line (half the destination width)
            STR_BUF+curr_dst_idx, // Tag
            0, 0 );

    // wait for completion of the stores of both buffer slots
    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    DMA_WAIT_TAG( (STR_BUF+(curr_dst_idx^1)) );
    //---------------------------------------------------------------------------------------------
}
/*
 * scale_srcw16_dstw32()
 *
 * Scaler variant that main() selects when the source pixel width is not a
 * multiple of 32 but the destination pixel width is.
 *
 * Reads the planar Y, V and U source planes named in the global parms from
 * main memory, scales them bilinearly to dst_pixel_width x dst_pixel_height
 * and stores the result in YUV format to parms.dstBuffer: the Y plane first,
 * the V plane dst_width*dst_height bytes later, and the U plane a quarter
 * of that picture size after the V plane.  No colour conversion is done.
 *
 * Every pass produces two Y lines plus one V and one U line.  Source lines
 * are double buffered: the DMA for the next pass is started before the
 * current pass is scaled.
 *
 * Takes no arguments and returns nothing.  Returns without doing any work
 * if the destination has fewer than two lines.
 */
void scale_srcw16_dstw32(void) {
    // extract parameters
    unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
    unsigned int src_width = parms.src_pixel_width;
    unsigned int src_height = parms.src_pixel_height;
    unsigned int dst_width = parms.dst_pixel_width;
    unsigned int dst_height = parms.dst_pixel_height;

    // Every pass writes two Y lines, so there is nothing to produce for
    // fewer than two destination lines.  Leaving here also keeps the
    // unsigned loop bound (dst_height>>1)-1 from wrapping around and
    // y_scale from being divided by zero.
    if( dst_height < 2 ) {
        return;
    }

    // YVU source strides in bytes (V and U lines are half as wide as Y lines)
    unsigned int src_linestride_y = src_width;
    unsigned int src_dbl_linestride_y = src_width<<1;
    unsigned int src_linestride_vu = src_width>>1;
    unsigned int src_dbl_linestride_vu = src_width;

    // scaled YVU
    unsigned int scaled_src_linestride_y = dst_width;

    // ram addresses
    unsigned char* src_addr_y = parms.y_plane;
    unsigned char* src_addr_v = parms.v_plane;
    unsigned char* src_addr_u = parms.u_plane;

    unsigned int dst_picture_size = dst_width*dst_height;

    // Sizes for destination: one store covers two Y lines or one V/U line
    unsigned int dst_dbl_linestride_y = dst_width<<1;
    unsigned int dst_dbl_linestride_vu = dst_width>>1;

    // Perform address calculation for Y, V and U in main memory with dst_addr as base
    unsigned char* dst_addr_main_memory_y = dst_addr;
    unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
    unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

    // for handling misalignment, addresses are precalculated
    unsigned char* precalc_src_addr_v = src_addr_v;
    unsigned char* precalc_src_addr_u = src_addr_u;

    // calculate scale factors
    vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
    float y_scale = (float)src_height/(float)dst_height;

    // double buffered processing
    // buffer switching
    unsigned int curr_src_idx = 0;
    unsigned int curr_dst_idx = 0;
    unsigned int next_src_idx, next_dst_idx;

    // source line numbers (integer part of the source position)
    unsigned int next_interpl_y_upper;
    unsigned int curr_interpl_y_lower, next_interpl_y_lower;
    unsigned int next_interpl_vu;

    // weights (fractional part of the source position); the first upper
    // Y line and the first V/U line sit exactly on source line 0
    vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
    vector float vf_next_NSweight_y_upper;
    vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
    vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
    vector float vf_next_NSweight_vu;

    // source positions
    float next_src_y_upper;
    float curr_src_y_lower, next_src_y_lower;
    float next_src_vu;

    // line indices for the dst picture
    unsigned int dst_y=0, dst_vu=0;

    // offset for the v and u plane to handle misalignment
    unsigned int curr_lsoff_v = 0, next_lsoff_v;
    unsigned int curr_lsoff_u = 0, next_lsoff_u;

    // source position, line number and weight for destination line 1
    curr_src_y_lower = 1.0f*y_scale;
    curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
    vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );

    // start partially double buffered processing
    // get initial data, 2 sets of y, 1 set v, 1 set u
    mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
    mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
            (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
            src_dbl_linestride_y,
            RETR_BUF,
            0, 0 );
    mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
    mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

    // iteration loop
    // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
    // the scaled output is 2 lines y, 1 line v, 1 line u
    // the scaled YUV output is stored to RAM
    // the last pass is handled after the loop because it fetches nothing
    for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
        dst_y = dst_vu<<1;

        // calculate next indices
        next_src_vu = ((float)dst_vu+1)*y_scale;
        next_src_y_upper = ((float)dst_y+2)*y_scale;
        next_src_y_lower = ((float)dst_y+3)*y_scale;
        next_interpl_vu = (unsigned int) next_src_vu;
        next_interpl_y_upper = (unsigned int) next_src_y_upper;
        next_interpl_y_lower = (unsigned int) next_src_y_lower;

        // calculate weight NORTH-SOUTH
        vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
        vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
        vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

        // get next lines
        next_src_idx = curr_src_idx^1;
        next_dst_idx = curr_dst_idx^1;

        // 4 lines y
        mfc_get( y_plane[next_src_idx],
                (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
                src_dbl_linestride_y,
                RETR_BUF+next_src_idx,
                0, 0 );
        mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
                (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
                src_dbl_linestride_y,
                RETR_BUF+next_src_idx,
                0, 0 );

        // 2 lines v: fetch from the preceding 16-byte boundary and remember
        // how far into the buffer the wanted data starts
        precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
        next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
        mfc_get( v_plane[next_src_idx],
                ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
                src_dbl_linestride_vu+(next_lsoff_v<<1),
                RETR_BUF+next_src_idx,
                0, 0 );

        // 2 lines u: same scheme, sized by the U plane's own offset
        precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
        next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
        mfc_get( u_plane[next_src_idx],
                ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
                src_dbl_linestride_vu+(next_lsoff_u<<1),
                RETR_BUF+next_src_idx,
                0, 0 );

        DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

        // the scaled buffers of this slot may still be the source of the
        // store issued two passes ago; wait for it before overwriting them
        DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

        // scaling
        // work line y_upper
        bilinear_scale_line_w16( y_plane[curr_src_idx],
                scaled_y_plane[curr_src_idx],
                dst_width,
                vf_x_scale,
                vf_curr_NSweight_y_upper,
                src_linestride_y );
        // work line y_lower
        bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
                scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
                dst_width,
                vf_x_scale,
                vf_curr_NSweight_y_lower,
                src_linestride_y );
        // work line v
        bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
                scaled_v_plane[curr_src_idx],
                dst_width>>1,
                vf_x_scale,
                vf_curr_NSweight_vu,
                src_linestride_vu );
        // work line u
        bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
                scaled_u_plane[curr_src_idx],
                dst_width>>1,
                vf_x_scale,
                vf_curr_NSweight_vu,
                src_linestride_vu );

        //---------------------------------------------------------------------------------------------
        // Perform three DMA transfers to 3 different locations in the main memory!
        // dst_vu counts V/U lines, i.e. pairs of Y lines, already written
        mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
                (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
                dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
                STR_BUF+curr_dst_idx, // Tag
                0, 0 );
        mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
                (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
                dst_dbl_linestride_vu, // One V line (half the destination width)
                STR_BUF+curr_dst_idx, // Tag
                0, 0 );
        mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
                (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
                dst_dbl_linestride_vu, // One U line (half the destination width)
                STR_BUF+curr_dst_idx, // Tag
                0, 0 );
        //---------------------------------------------------------------------------------------------

        // update for next cycle
        curr_src_idx = next_src_idx;
        curr_dst_idx = next_dst_idx;
        vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
        vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
        vf_curr_NSweight_vu = vf_next_NSweight_vu;
        curr_lsoff_v = next_lsoff_v;
        curr_lsoff_u = next_lsoff_u;
    }

    // last pass: the lines were fetched by the final loop iteration (or by
    // the initial transfers if the loop did not run), nothing more to fetch
    DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

    // make sure the earlier store from this slot's scaled buffers is done
    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

    // scaling
    // work line y_upper
    bilinear_scale_line_w16( y_plane[curr_src_idx],
            scaled_y_plane[curr_src_idx],
            dst_width,
            vf_x_scale,
            vf_curr_NSweight_y_upper,
            src_linestride_y );
    // work line y_lower
    bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
            scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
            dst_width,
            vf_x_scale,
            vf_curr_NSweight_y_lower,
            src_linestride_y );
    // work line v
    bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
            scaled_v_plane[curr_src_idx],
            dst_width>>1,
            vf_x_scale,
            vf_curr_NSweight_vu,
            src_linestride_vu );
    // work line u
    bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
            scaled_u_plane[curr_src_idx],
            dst_width>>1,
            vf_x_scale,
            vf_curr_NSweight_vu,
            src_linestride_vu );

    //---------------------------------------------------------------------------------------------
    // Perform three DMA transfers to 3 different locations in the main memory!
    // dst_vu now equals (dst_height>>1)-1, the last V/U line
    mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
            (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
            dst_dbl_linestride_y, // Two Y lines (depending on the width of the destination resolution)
            STR_BUF+curr_dst_idx, // Tag
            0, 0 );
    mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
            (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
            dst_dbl_linestride_vu, // One V line (half the destination width)
            STR_BUF+curr_dst_idx, // Tag
            0, 0 );
    mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
            (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
            dst_dbl_linestride_vu, // One U line (half the destination width)
            STR_BUF+curr_dst_idx, // Tag
            0, 0 );

    // wait for completion of the stores of both buffer slots
    DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
    DMA_WAIT_TAG( (STR_BUF+(curr_dst_idx^1)) );
    //---------------------------------------------------------------------------------------------
}
/*
* scale_srcw32_dstw16()
*
* processes an input image of width 32
* scaling is done to a width 16
* yuv2rgb conversion on a width of 16
* result stored in RAM
*/
void scale_srcw32_dstw16() {
// extract parameters
unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
unsigned int src_width = parms.src_pixel_width;
unsigned int src_height = parms.src_pixel_height;
unsigned int dst_width = parms.dst_pixel_width;
unsigned int dst_height = parms.dst_pixel_height;
// YVU
unsigned int src_linestride_y = src_width;
unsigned int src_dbl_linestride_y = src_width<<1;
unsigned int src_linestride_vu = src_width>>1;
unsigned int src_dbl_linestride_vu = src_width;
// scaled YVU
unsigned int scaled_src_linestride_y = dst_width;
// ram addresses
unsigned char* src_addr_y = parms.y_plane;
unsigned char* src_addr_v = parms.v_plane;
unsigned char* src_addr_u = parms.u_plane;
unsigned int dst_picture_size = dst_width*dst_height;
// Sizes for destination
unsigned int dst_dbl_linestride_y = dst_width<<1;
unsigned int dst_dbl_linestride_vu = dst_width>>1;
// Perform address calculation for Y, V and U in main memory with dst_addr as base
unsigned char* dst_addr_main_memory_y = dst_addr;
unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
// calculate scale factors
vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
float y_scale = (float)src_height/(float)dst_height;
// double buffered processing
// buffer switching
unsigned int curr_src_idx = 0;
unsigned int curr_dst_idx = 0;
unsigned int next_src_idx, next_dst_idx;
// 2 lines y as output, upper and lowerline
unsigned int curr_interpl_y_upper = 0;
unsigned int next_interpl_y_upper;
unsigned int curr_interpl_y_lower, next_interpl_y_lower;
// only 1 line v/u output, both planes have the same dimension
unsigned int curr_interpl_vu = 0;
unsigned int next_interpl_vu;
// weights, calculated in every loop iteration
vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
vector float vf_next_NSweight_y_upper;
vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
vector float vf_next_NSweight_vu;
// line indices for the src picture
float curr_src_y_upper = 0.0f, next_src_y_upper;
float curr_src_y_lower, next_src_y_lower;
float curr_src_vu = 0.0f, next_src_vu;
// line indices for the dst picture
unsigned int dst_y=0, dst_vu=0;
// calculate lower line idices
curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
// lower line weight
vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
// start partially double buffered processing
// get initial data, 2 sets of y, 1 set v, 1 set u
mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
src_dbl_linestride_y,
RETR_BUF,
0, 0 );
mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
// iteration loop
// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
// the scaled output is 2 lines y, 1 line v, 1 line u
// the yuv2rgb-converted output is stored to RAM
for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
dst_y = dst_vu<<1;
// calculate next indices
next_src_vu = ((float)dst_vu+1)*y_scale;
next_src_y_upper = ((float)dst_y+2)*y_scale;
next_src_y_lower = ((float)dst_y+3)*y_scale;
next_interpl_vu = (unsigned int) next_src_vu;
next_interpl_y_upper = (unsigned int) next_src_y_upper;
next_interpl_y_lower = (unsigned int) next_src_y_lower;
// calculate weight NORTH-SOUTH
vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
// get next lines
next_src_idx = curr_src_idx^1;
next_dst_idx = curr_dst_idx^1;
// 4 lines y
mfc_get( y_plane[next_src_idx],
(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
src_dbl_linestride_y,
RETR_BUF+next_src_idx,
0, 0 );
mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
src_dbl_linestride_y,
RETR_BUF+next_src_idx,
0, 0 );
// 2 lines v
mfc_get( v_plane[next_src_idx],
(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
src_dbl_linestride_vu,
RETR_BUF+next_src_idx,
0, 0 );
// 2 lines u
mfc_get( u_plane[next_src_idx],
(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
src_dbl_linestride_vu,
RETR_BUF+next_src_idx,
0, 0 );
DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
// scaling
// work line y_upper
bilinear_scale_line_w16( y_plane[curr_src_idx],
scaled_y_plane[curr_src_idx],
dst_width,
vf_x_scale,
vf_curr_NSweight_y_upper,
src_linestride_y );
// work line y_lower
bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
dst_width,
vf_x_scale,
vf_curr_NSweight_y_lower,
src_linestride_y );
// work line v
bilinear_scale_line_w16( v_plane[curr_src_idx],
scaled_v_plane[curr_src_idx],
dst_width>>1,
vf_x_scale,
vf_curr_NSweight_vu,
src_linestride_vu );
// work line u
bilinear_scale_line_w16( u_plane[curr_src_idx],
scaled_u_plane[curr_src_idx],
dst_width>>1,
vf_x_scale,
vf_curr_NSweight_vu,
src_linestride_vu );
//---------------------------------------------------------------------------------------------
DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
// Perform three DMA transfers to 3 different locations in the main memory!
// dst_width: Pixel width of destination image
// dst_addr: Destination address in main memory
// dst_vu: Counter which is incremented one by one
// dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
STR_BUF+curr_dst_idx, // Tag
0, 0 );
mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
STR_BUF+curr_dst_idx, // Tag
0, 0 );
mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
STR_BUF+curr_dst_idx, // Tag
0, 0 );
//---------------------------------------------------------------------------------------------
// update for next cycle
curr_src_idx = next_src_idx;
curr_dst_idx = next_dst_idx;
curr_interpl_y_upper = next_interpl_y_upper;
curr_interpl_y_lower = next_interpl_y_lower;
curr_interpl_vu = next_interpl_vu;
vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
vf_curr_NSweight_vu = vf_next_NSweight_vu;
curr_src_y_upper = next_src_y_upper;
curr_src_y_lower = next_src_y_lower;
curr_src_vu = next_src_vu;
}
DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
// scaling
// work line y_upper
bilinear_scale_line_w16( y_plane[curr_src_idx],
scaled_y_plane[curr_src_idx],
dst_width,
vf_x_scale,
vf_curr_NSweight_y_upper,
src_linestride_y );
// work line y_lower
bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
dst_width,
vf_x_scale,
vf_curr_NSweight_y_lower,
src_linestride_y );
// work line v
bilinear_scale_line_w16( v_plane[curr_src_idx],
scaled_v_plane[curr_src_idx],
dst_width>>1,
vf_x_scale,
vf_curr_NSweight_vu,
src_linestride_vu );
// work line u
bilinear_scale_line_w16( u_plane[curr_src_idx],
scaled_u_plane[curr_src_idx],
dst_width>>1,
vf_x_scale,
vf_curr_NSweight_vu,
src_linestride_vu );
//---------------------------------------------------------------------------------------------
DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
// Perform three DMA transfers to 3 different locations in the main memory!
// dst_width: Pixel width of destination image
// dst_addr: Destination address in main memory
// dst_vu: Counter which is incremented one by one
// dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr)
(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr)
dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution)
STR_BUF+curr_dst_idx, // Tag
0, 0 );
mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr)
(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution)
STR_BUF+curr_dst_idx, // Tag
0, 0 );
mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr)
(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution)
STR_BUF+curr_dst_idx, // Tag
0, 0 );
// wait for completion
DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
//---------------------------------------------------------------------------------------------
}
/**
 * scale_srcw32_dstw32()
 *
 * Bilinearly scales the planar Y/V/U image described by the global `parms`
 * and stores the scaled planes to main memory at parms.dstBuffer:
 * the Y plane first, then V at +dst_width*dst_height, then U a further
 * (dst_width*dst_height)/4 bytes on.
 *
 * The name indicates a source and destination width of 32; the widths are
 * not checked here. Every line is scaled with bilinear_scale_line_w16().
 * No colour-space conversion is performed in this function.
 *
 * Processing is double buffered: while line set `curr` is scaled and written
 * back, the source lines for set `next` are already being fetched by DMA.
 * Each pass yields 2 lines Y, 1 line V and 1 line U.
 *
 * Returns without doing anything if the destination has fewer than 2 lines.
 */
void scale_srcw32_dstw32() {
	// extract parameters
	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
	unsigned int src_width = parms.src_pixel_width;
	unsigned int src_height = parms.src_pixel_height;
	unsigned int dst_width = parms.dst_pixel_width;
	unsigned int dst_height = parms.dst_pixel_height;

	// Every pass emits a pair of Y lines. With fewer than two destination
	// lines the unsigned loop bound (dst_height>>1)-1 would wrap around and
	// the final store would write past the destination image.
	if( dst_height < 2 ) {
		return;
	}

	// YVU: line strides of the source planes in bytes; every fetch covers
	// two consecutive lines (the NORTH and the SOUTH line of the interpolation)
	unsigned int src_linestride_y = src_width;
	unsigned int src_dbl_linestride_y = src_width<<1;
	unsigned int src_linestride_vu = src_width>>1;
	unsigned int src_dbl_linestride_vu = src_width;

	// scaled YVU
	unsigned int scaled_src_linestride_y = dst_width;

	// ram addresses
	unsigned char* src_addr_y = parms.y_plane;
	unsigned char* src_addr_v = parms.v_plane;
	unsigned char* src_addr_u = parms.u_plane;

	unsigned int dst_picture_size = dst_width*dst_height;

	// Sizes for destination: one pass stores two Y lines and one V/U line
	unsigned int dst_dbl_linestride_y = dst_width<<1;
	unsigned int dst_linestride_vu = dst_width>>1;

	// Perform address calculation for Y, V and U in main memory with dst_addr as base
	unsigned char* dst_addr_main_memory_y = dst_addr;
	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v + (dst_picture_size>>2);

	// calculate scale factors
	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
	float y_scale = (float)src_height/(float)dst_height;

	// double buffered processing
	// buffer switching
	unsigned int curr_src_idx = 0;
	unsigned int curr_dst_idx = 0;
	unsigned int next_src_idx, next_dst_idx;

	// source line indices (integer part of the source position) of the next line set
	unsigned int next_interpl_y_upper, next_interpl_y_lower, next_interpl_vu;

	// weights NORTH-SOUTH; the first upper Y line and the first V/U line map
	// exactly onto source line 0, hence weight 0
	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_curr_NSweight_y_lower;
	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_y_upper, vf_next_NSweight_y_lower, vf_next_NSweight_vu;

	// source positions of the next line set
	float next_src_y_upper, next_src_y_lower, next_src_vu;

	// line indices for the dst picture
	unsigned int dst_y = 0, dst_vu = 0;

	// first lower Y line: destination line 1 maps to source position 1*y_scale
	float first_src_y_lower = y_scale;
	unsigned int first_interpl_y_lower = (unsigned int)first_src_y_lower;
	vf_curr_NSweight_y_lower = spu_splats( first_src_y_lower-(float)first_interpl_y_lower );

	// start partially double buffered processing
	// get initial data, 2 sets of y, 1 set v, 1 set u
	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
			(unsigned int) src_addr_y+(first_interpl_y_lower*src_linestride_y),
			src_dbl_linestride_y,
			RETR_BUF,
			0, 0 );
	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

	// iteration loop
	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
	// the scaled output is 2 lines y, 1 line v, 1 line u
	// the scaled YUV output is stored to RAM
	// (the last line set is handled after the loop, without a further prefetch)
	for( dst_vu=0; dst_vu+1<(dst_height>>1); dst_vu++ ) {
		dst_y = dst_vu<<1;

		// calculate next indices
		next_src_vu = ((float)dst_vu+1)*y_scale;
		next_src_y_upper = ((float)dst_y+2)*y_scale;
		next_src_y_lower = ((float)dst_y+3)*y_scale;

		next_interpl_vu = (unsigned int) next_src_vu;
		next_interpl_y_upper = (unsigned int) next_src_y_upper;
		next_interpl_y_lower = (unsigned int) next_src_y_lower;

		// calculate weight NORTH-SOUTH
		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

		// get next lines
		next_src_idx = curr_src_idx^1;
		next_dst_idx = curr_dst_idx^1;

		// 4 lines y
		mfc_get( y_plane[next_src_idx],
				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );
		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );

		// 2 lines v
		mfc_get( v_plane[next_src_idx],
				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
				src_dbl_linestride_vu,
				RETR_BUF+next_src_idx,
				0, 0 );

		// 2 lines u
		mfc_get( u_plane[next_src_idx],
				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
				src_dbl_linestride_vu,
				RETR_BUF+next_src_idx,
				0, 0 );

		// source lines of the current set have to be present before scaling
		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

		// The scaled_* buffers of this index were handed to mfc_put two
		// passes ago under this tag; wait before overwriting them.
		// NOTE(review): assumes DMA_WAIT_TAG blocks until all transfers
		// of the tag have completed - confirm against its definition.
		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

		// scaling
		// work line y_upper
		bilinear_scale_line_w16( y_plane[curr_src_idx],
				scaled_y_plane[curr_src_idx],
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_upper,
				src_linestride_y );
		// work line y_lower
		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_lower,
				src_linestride_y );
		// work line v
		bilinear_scale_line_w16( v_plane[curr_src_idx],
				scaled_v_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );
		// work line u
		bilinear_scale_line_w16( u_plane[curr_src_idx],
				scaled_u_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );

		// Store the result back to main memory into a destination buffer in YUV format
		//---------------------------------------------------------------------------------------------
		// Perform three DMA transfers to 3 different locations in the main memory!
		// dst_vu: index of the current output line pair, incremented one by one
		mfc_put( scaled_y_plane[curr_src_idx],	// What from local store (addr)
				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
				dst_dbl_linestride_y,	// Two Y lines
				STR_BUF+curr_dst_idx,	// Tag
				0, 0 );
		mfc_put( scaled_v_plane[curr_src_idx],	// What from local store (addr)
				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
				dst_linestride_vu,	// One V line (dst_width/2 bytes)
				STR_BUF+curr_dst_idx,	// Tag
				0, 0 );
		mfc_put( scaled_u_plane[curr_src_idx],	// What from local store (addr)
				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
				dst_linestride_vu,	// One U line (dst_width/2 bytes)
				STR_BUF+curr_dst_idx,	// Tag
				0, 0 );
		//---------------------------------------------------------------------------------------------

		// update for next cycle: the prefetched set becomes the current one.
		// All three weights have to advance together with the buffers.
		curr_src_idx = next_src_idx;
		curr_dst_idx = next_dst_idx;
		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
		vf_curr_NSweight_vu = vf_next_NSweight_vu;
	}

	// last line set: nothing left to prefetch, only scale and store
	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
	// see the note inside the loop: the scaled_* buffers may still be the source of an earlier mfc_put
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

	// scaling
	// work line y_upper
	bilinear_scale_line_w16( y_plane[curr_src_idx],
			scaled_y_plane[curr_src_idx],
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_upper,
			src_linestride_y );
	// work line y_lower
	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_lower,
			src_linestride_y );
	// work line v
	bilinear_scale_line_w16( v_plane[curr_src_idx],
			scaled_v_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );
	// work line u
	bilinear_scale_line_w16( u_plane[curr_src_idx],
			scaled_u_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );

	// Store the result back to main memory into a destination buffer in YUV format
	//---------------------------------------------------------------------------------------------
	// dst_vu now equals (dst_height>>1)-1, the index of the last output line pair
	mfc_put( scaled_y_plane[curr_src_idx],	// What from local store (addr)
			(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
			dst_dbl_linestride_y,	// Two Y lines
			STR_BUF+curr_dst_idx,	// Tag
			0, 0 );
	mfc_put( scaled_v_plane[curr_src_idx],	// What from local store (addr)
			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
			dst_linestride_vu,	// One V line (dst_width/2 bytes)
			STR_BUF+curr_dst_idx,	// Tag
			0, 0 );
	mfc_put( scaled_u_plane[curr_src_idx],	// What from local store (addr)
			(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
			dst_linestride_vu,	// One U line (dst_width/2 bytes)
			STR_BUF+curr_dst_idx,	// Tag
			0, 0 );

	// wait for completion
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
	//---------------------------------------------------------------------------------------------
}
/*
* bilinear_scale_line_w8()
*
* processes a line of yuv-input, width has to be a multiple of 8
* scaled yuv-output is written to local store buffer
*
* @param src buffer for 2 lines input
* @param dst_ buffer for 1 line output
* @param dst_width the width of the destination line
* @param vf_x_scale a float vector, at each entry is the x_scale-factor
* @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
* @param src_linestride the stride of the srcline
*/
void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
unsigned char* dst = dst_;
unsigned int dst_x;
for( dst_x=0; dst_x<dst_width; dst_x+=8) {
// address calculation for loading the 4 surrounding pixel of each calculated
// destination pixel
vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
// lower range->first 4 pixel
// upper range->next 4 pixel
vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
// calculate weight EAST-WEST
vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
// calculate address offset
//
// pixel NORTH WEST
vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
// pixel NORTH EAST-->(offpixelNW+1)
vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
// SOUTH-WEST-->(offpixelNW+src_linestride)
vector unsigned int vui_srclinestride = spu_splats( src_linestride );
vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
// SOUTH-EAST-->(offpixelNW+src_linestride+1)
vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
// calculate each address
vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
// get each pixel
//
// scalar load, afterwards insertion into the right position
// NORTH WEST
vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
vuc_pixel_NW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
vuc_pixel_NW_lower_range, 7 );
vuc_pixel_NW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
vuc_pixel_NW_lower_range, 11 );
vuc_pixel_NW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
vuc_pixel_NW_lower_range, 15 );
vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
vuc_pixel_NW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
vuc_pixel_NW_upper_range, 7 );
vuc_pixel_NW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
vuc_pixel_NW_upper_range, 11 );
vuc_pixel_NW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
vuc_pixel_NW_upper_range, 15 );
// NORTH EAST
vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
vuc_pixel_NE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
vuc_pixel_NE_lower_range, 7 );
vuc_pixel_NE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
vuc_pixel_NE_lower_range, 11 );
vuc_pixel_NE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
vuc_pixel_NE_lower_range, 15 );
vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
vuc_pixel_NE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
vuc_pixel_NE_upper_range, 7 );
vuc_pixel_NE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
vuc_pixel_NE_upper_range, 11 );
vuc_pixel_NE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
vuc_pixel_NE_upper_range, 15 );
// SOUTH WEST
vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
vuc_pixel_SW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
vuc_pixel_SW_lower_range, 7 );
vuc_pixel_SW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
vuc_pixel_SW_lower_range, 11 );
vuc_pixel_SW_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
vuc_pixel_SW_lower_range, 15 );
vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
vuc_pixel_SW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
vuc_pixel_SW_upper_range, 7 );
vuc_pixel_SW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
vuc_pixel_SW_upper_range, 11 );
vuc_pixel_SW_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
vuc_pixel_SW_upper_range, 15 );
// SOUTH EAST
vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
vuc_pixel_SE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
vuc_pixel_SE_lower_range, 7 );
vuc_pixel_SE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
vuc_pixel_SE_lower_range, 11 );
vuc_pixel_SE_lower_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
vuc_pixel_SE_lower_range, 15 );
vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
vuc_pixel_SE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
vuc_pixel_SE_upper_range, 7 );
vuc_pixel_SE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
vuc_pixel_SE_upper_range, 11 );
vuc_pixel_SE_upper_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
vuc_pixel_SE_upper_range, 15 );
// convert to float
vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
// first linear interpolation: EWtop
// EWtop = NW + EWweight*(NE-NW)
//
// lower range
vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
vf_EWtop_lower_range_tmp,
vf_pixel_NW_lower_range );
// upper range
vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
vf_EWtop_upper_range_tmp,
vf_pixel_NW_upper_range );
// second linear interpolation: EWbottom
// EWbottom = SW + EWweight*(SE-SW)
//
// lower range
vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
vf_EWbottom_lower_range_tmp,
vf_pixel_SW_lower_range );
// upper range
vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
vf_EWbottom_upper_range_tmp,
vf_pixel_SW_upper_range );
// third linear interpolation: the bilinear interpolated value
// result = EWtop + NSweight*(EWbottom-EWtop);
//
// lower range
vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
vector float vf_result_lower_range = spu_madd( vf_NSweight,
vf_result_lower_range_tmp,
vf_EWtop_lower_range );
// upper range
vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
vector float vf_result_upper_range = spu_madd( vf_NSweight,
vf_result_upper_range_tmp,
vf_EWtop_upper_range );
// convert back: using saturated arithmetic
vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
// merge results->lower,upper
vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
0x13, 0x17, 0x1B, 0x1F,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00 };
vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
(vector unsigned char) vui_result_upper_range,
vuc_mask_merge_result );
// partial storing
vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF };
// get currently stored data
vector unsigned char vuc_orig = *((vector unsigned char*)dst);
// clear currently stored data
vuc_orig = spu_and( vuc_orig,
spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
// rotate result according to storing address
vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
// store result
*((vector unsigned char*)dst) = spu_or( vuc_result,
vuc_orig );
dst += 8;
}
}
/*
* bilinear_scale_line_w16()
*
* processes a line of yuv-input, width has to be a multiple of 16
* scaled yuv-output is written to local store buffer
*
* @param src buffer for 2 lines input
* @param dst_ buffer for 1 line output
* @param dst_width the width of the destination line
* @param vf_x_scale a float vector, at each entry is the x_scale-factor
* @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
* @param src_linestride the stride of the srcline
*/
void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
unsigned char* dst = dst_;
unsigned int dst_x;
for( dst_x=0; dst_x<dst_width; dst_x+=16) {
// address calculation for loading the 4 surrounding pixel of each calculated
// destination pixel
vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
// parallelised processing
// first range->pixel 1 2 3 4
// second range->pixel 5 6 7 8
// third range->pixel 9 10 11 12
// fourth range->pixel 13 14 15 16
vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
// calculate weight EAST-WEST
vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
// calculate address offset
//
// pixel NORTH WEST
vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
// pixel NORTH EAST-->(offpixelNW+1)
vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
// SOUTH-WEST-->(offpixelNW+src_linestride)
vector unsigned int vui_srclinestride = spu_splats( src_linestride );
vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
// SOUTH-EAST-->(offpixelNW+src_linestride+1)
vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
// calculate each address
vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
// get each pixel
//
// scalar load, afterwards insertion into the right position
// NORTH WEST
// first range
vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
vector unsigned char vuc_pixel_NW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
vuc_pixel_NW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
vuc_pixel_NW_first_range, 7 );
vuc_pixel_NW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
vuc_pixel_NW_first_range, 11 );
vuc_pixel_NW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
vuc_pixel_NW_first_range, 15 );
// second range
vector unsigned char vuc_pixel_NW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
vuc_pixel_NW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
vuc_pixel_NW_second_range, 7 );
vuc_pixel_NW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
vuc_pixel_NW_second_range, 11 );
vuc_pixel_NW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
vuc_pixel_NW_second_range, 15 );
// third range
vector unsigned char vuc_pixel_NW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
vuc_pixel_NW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
vuc_pixel_NW_third_range, 7 );
vuc_pixel_NW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
vuc_pixel_NW_third_range, 11 );
vuc_pixel_NW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
vuc_pixel_NW_third_range, 15 );
// fourth range
vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
vuc_pixel_NW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
vuc_pixel_NW_fourth_range, 7 );
vuc_pixel_NW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
vuc_pixel_NW_fourth_range, 11 );
vuc_pixel_NW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
vuc_pixel_NW_fourth_range, 15 );
// NORTH EAST
// first range
vector unsigned char vuc_pixel_NE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
vuc_pixel_NE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
vuc_pixel_NE_first_range, 7 );
vuc_pixel_NE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
vuc_pixel_NE_first_range, 11 );
vuc_pixel_NE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
vuc_pixel_NE_first_range, 15 );
// second range
vector unsigned char vuc_pixel_NE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
vuc_pixel_NE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
vuc_pixel_NE_second_range, 7 );
vuc_pixel_NE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
vuc_pixel_NE_second_range, 11 );
vuc_pixel_NE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
vuc_pixel_NE_second_range, 15 );
// third range
vector unsigned char vuc_pixel_NE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
vuc_pixel_NE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
vuc_pixel_NE_third_range, 7 );
vuc_pixel_NE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
vuc_pixel_NE_third_range, 11 );
vuc_pixel_NE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
vuc_pixel_NE_third_range, 15 );
// fourth range
vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
vuc_pixel_NE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
vuc_pixel_NE_fourth_range, 7 );
vuc_pixel_NE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
vuc_pixel_NE_fourth_range, 11 );
vuc_pixel_NE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
vuc_pixel_NE_fourth_range, 15 );
// SOUTH WEST
// first range
vector unsigned char vuc_pixel_SW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
vuc_pixel_SW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
vuc_pixel_SW_first_range, 7 );
vuc_pixel_SW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
vuc_pixel_SW_first_range, 11 );
vuc_pixel_SW_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
vuc_pixel_SW_first_range, 15 );
// second range
vector unsigned char vuc_pixel_SW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
vuc_pixel_SW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
vuc_pixel_SW_second_range, 7 );
vuc_pixel_SW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
vuc_pixel_SW_second_range, 11 );
vuc_pixel_SW_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
vuc_pixel_SW_second_range, 15 );
// third range
vector unsigned char vuc_pixel_SW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
vuc_pixel_SW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
vuc_pixel_SW_third_range, 7 );
vuc_pixel_SW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
vuc_pixel_SW_third_range, 11 );
vuc_pixel_SW_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
vuc_pixel_SW_third_range, 15 );
// fourth range
vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
vuc_pixel_SW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
vuc_pixel_SW_fourth_range, 7 );
vuc_pixel_SW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
vuc_pixel_SW_fourth_range, 11 );
vuc_pixel_SW_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
vuc_pixel_SW_fourth_range, 15 );
// SOUTH EAST
// first range
vector unsigned char vuc_pixel_SE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
vuc_pixel_SE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
vuc_pixel_SE_first_range, 7 );
vuc_pixel_SE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
vuc_pixel_SE_first_range, 11 );
vuc_pixel_SE_first_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
vuc_pixel_SE_first_range, 15 );
// second range
vector unsigned char vuc_pixel_SE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
vuc_pixel_SE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
vuc_pixel_SE_second_range, 7 );
vuc_pixel_SE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
vuc_pixel_SE_second_range, 11 );
vuc_pixel_SE_second_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
vuc_pixel_SE_second_range, 15 );
// third range
vector unsigned char vuc_pixel_SE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
vuc_pixel_SE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
vuc_pixel_SE_third_range, 7 );
vuc_pixel_SE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
vuc_pixel_SE_third_range, 11 );
vuc_pixel_SE_third_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
vuc_pixel_SE_third_range, 15 );
// fourth range
vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
vuc_pixel_SE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
vuc_pixel_SE_fourth_range, 7 );
vuc_pixel_SE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
vuc_pixel_SE_fourth_range, 11 );
vuc_pixel_SE_fourth_range = spu_insert(
*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
vuc_pixel_SE_fourth_range, 15 );
// convert to float
vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
// first linear interpolation: EWtop
// EWtop = NW + EWweight*(NE-NW)
//
// first range
vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
vf_EWtop_first_range_tmp,
vf_pixel_NW_first_range );
// second range
vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
vf_EWtop_second_range_tmp,
vf_pixel_NW_second_range );
// third range
vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
vf_EWtop_third_range_tmp,
vf_pixel_NW_third_range );
// fourth range
vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
vf_EWtop_fourth_range_tmp,
vf_pixel_NW_fourth_range );
// second linear interpolation: EWbottom
// EWbottom = SW + EWweight*(SE-SW)
//
// first range
vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
vf_EWbottom_first_range_tmp,
vf_pixel_SW_first_range );
// second range
vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
vf_EWbottom_second_range_tmp,
vf_pixel_SW_second_range );
// third range
vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
vf_EWbottom_third_range_tmp,
vf_pixel_SW_third_range );
// fourth range
vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
vf_EWbottom_fourth_range_tmp,
vf_pixel_SW_fourth_range );
// third linear interpolation: the bilinear interpolated value
// result = EWtop + NSweight*(EWbottom-EWtop);
//
// first range
vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
vector float vf_result_first_range = spu_madd( vf_NSweight,
vf_result_first_range_tmp,
vf_EWtop_first_range );
// second range
vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
vector float vf_result_second_range = spu_madd( vf_NSweight,
vf_result_second_range_tmp,
vf_EWtop_second_range );
// third range
vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
vector float vf_result_third_range = spu_madd( vf_NSweight,
vf_result_third_range_tmp,
vf_EWtop_third_range );
// fourth range
vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
vector float vf_result_fourth_range = spu_madd( vf_NSweight,
vf_result_fourth_range_tmp,
vf_EWtop_fourth_range );
// convert back: using saturated arithmetic
vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
// merge results->lower,upper
vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
0x13, 0x17, 0x1B, 0x1F,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00 };
vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
0x03, 0x07, 0x0B, 0x0F,
0x13, 0x17, 0x1B, 0x1F };
vector unsigned char vuc_result_first_second =
spu_shuffle( (vector unsigned char) vui_result_first_range,
(vector unsigned char) vui_result_second_range,
vuc_mask_merge_result_first_second );
vector unsigned char vuc_result_third_fourth =
spu_shuffle( (vector unsigned char) vui_result_third_range,
(vector unsigned char) vui_result_fourth_range,
vuc_mask_merge_result_third_fourth );
// store result
*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
vuc_result_third_fourth );
dst += 16;
}
}