distrib/sdl-1.2.15/src/video/ps3/spulibs/bilin_scaler.c - emulator-unstable-refactoring - Git at Google

 /*
  * SDL - Simple DirectMedia Layer
  * CELL BE Support for PS3 Framebuffer
  * Copyright (C) 2008, 2009 International Business Machines Corporation
  *
  * This library is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation; either version 2.1 of the License, or
  * (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  * USA
  *
  *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
  *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
  *  SPE code based on research by:
  *  Rene Becker
  *  Thimo Emmerich
  */

 #include "spu_common.h"

 #include <spu_intrinsics.h>
 #include <spu_mfcio.h>

 // Debugging
 //#define DEBUG

 #ifdef DEBUG
 #define deprintf(fmt, args... ) \
 	fprintf( stdout, fmt, ##args ); \
 	fflush( stdout );
 #else
 #define deprintf( fmt, args... )
 #endif

 struct scale_parms_t parms __attribute__((aligned(128)));

 /* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
  * there might be the need to retrieve misaligned data, adjust
  * incoming v and u plane to be able to handle this (add 128)
  */
 unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
 unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
 unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));

 /* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */
 unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
 unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
 unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));

 /* some vectors needed by the float to int conversion */
 static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
 static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };

 void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
 void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);

 void scale_srcw16_dstw16();
 void scale_srcw16_dstw32();
 void scale_srcw32_dstw16();
 void scale_srcw32_dstw32();

 int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
 {
 	deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
 	/* DMA transfer for the input parameters */
 	spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
 	DMA_WAIT_TAG(TAG_INIT);

 	deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
 			parms.dst_pixel_width, parms.dst_pixel_height);

 	if(parms.src_pixel_width & 0x1f) {
 		if(parms.dst_pixel_width & 0x1F) {
 			deprintf("[SPU] Using scale_srcw16_dstw16\n");
 			scale_srcw16_dstw16();
 		} else {
 			deprintf("[SPU] Using scale_srcw16_dstw32\n");
 			scale_srcw16_dstw32();
 		}
 	} else {
 		if(parms.dst_pixel_width & 0x1F) {
 			deprintf("[SPU] Using scale_srcw32_dstw16\n");
 			scale_srcw32_dstw16();
 		} else {
 			deprintf("[SPU] Using scale_srcw32_dstw32\n");
 			scale_srcw32_dstw32();
 		}
 	}
 	deprintf("[SPU] bilin_scaler_spu... done!\n");

 	return 0;
 }


 /*
  * vfloat_to_vuint()
  *
  * converts a float vector to an unsinged int vector using saturated
  * arithmetic
  *
  * @param vec_s float vector for conversion
  * @returns converted unsigned int vector
  */
 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
 	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
 	vec_s = spu_sel(vec_s, vec_0_1, select_1);

 	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
 	vec_s = spu_sel(vec_s, vec_255, select_2);
 	return spu_convtu(vec_s,0);
 }


 /*
  * scale_srcw16_dstw16()
  *
  * processes an input image of width 16
  * scaling is done to a width 16
  * result stored in RAM
  */
 void scale_srcw16_dstw16() {
 	// extract parameters
 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

 	unsigned int src_width = parms.src_pixel_width;
 	unsigned int src_height = parms.src_pixel_height;
 	unsigned int dst_width = parms.dst_pixel_width;
 	unsigned int dst_height = parms.dst_pixel_height;

 	// YVU
 	unsigned int src_linestride_y = src_width;
 	unsigned int src_dbl_linestride_y = src_width<<1;
 	unsigned int src_linestride_vu = src_width>>1;
 	unsigned int src_dbl_linestride_vu = src_width;

 	// scaled YVU
 	unsigned int scaled_src_linestride_y = dst_width;

 	// ram addresses
 	unsigned char* src_addr_y = parms.y_plane;
 	unsigned char* src_addr_v = parms.v_plane;
 	unsigned char* src_addr_u = parms.u_plane;

 	// for handling misalignment, addresses are precalculated
 	unsigned char* precalc_src_addr_v = src_addr_v;
 	unsigned char* precalc_src_addr_u = src_addr_u;

 	unsigned int dst_picture_size = dst_width*dst_height;

 	// Sizes for destination
 	unsigned int dst_dbl_linestride_y = dst_width<<1;
 	unsigned int dst_dbl_linestride_vu = dst_width>>1;

 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
 	unsigned char* dst_addr_main_memory_y = dst_addr;
 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

 	// calculate scale factors
 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 	float y_scale = (float)src_height/(float)dst_height;

 	// double buffered processing
 	// buffer switching
 	unsigned int curr_src_idx = 0;
 	unsigned int curr_dst_idx = 0;
 	unsigned int next_src_idx, next_dst_idx;

 	// 2 lines y as output, upper and lowerline
 	unsigned int curr_interpl_y_upper = 0;
 	unsigned int next_interpl_y_upper;
 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 	// only 1 line v/u output, both planes have the same dimension
 	unsigned int curr_interpl_vu = 0;
 	unsigned int next_interpl_vu;

 	// weights, calculated in every loop iteration
 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_y_upper;
 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_vu;

 	// line indices for the src picture
 	float curr_src_y_upper = 0.0f, next_src_y_upper;
 	float curr_src_y_lower, next_src_y_lower;
 	float curr_src_vu = 0.0f, next_src_vu;

 	// line indices for the dst picture
 	unsigned int dst_y=0, dst_vu=0;

 	// offset for the v and u plane to handle misalignement
 	unsigned int curr_lsoff_v = 0, next_lsoff_v;
 	unsigned int curr_lsoff_u = 0, next_lsoff_u;

 	// calculate lower line indices
 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 	// lower line weight
 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


 	// start partially double buffered processing
 	// get initial data, 2 sets of y, 1 set v, 1 set u
 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 			src_dbl_linestride_y,
 			RETR_BUF,
 			0, 0 );
 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

 	/* iteration loop
 	 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 	 * the scaled output is 2 lines y, 1 line v, 1 line u
 	 * the yuv2rgb-converted output is stored to RAM
 	 */
 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 		dst_y = dst_vu<<1;

 		// calculate next indices
 		next_src_vu = ((float)dst_vu+1)*y_scale;
 		next_src_y_upper = ((float)dst_y+2)*y_scale;
 		next_src_y_lower = ((float)dst_y+3)*y_scale;

 		next_interpl_vu = (unsigned int) next_src_vu;
 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
 		next_interpl_y_lower = (unsigned int) next_src_y_lower;

 		// calculate weight NORTH-SOUTH
 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

 		// get next lines
 		next_src_idx = curr_src_idx^1;
 		next_dst_idx = curr_dst_idx^1;

 		// 4 lines y
 		mfc_get( y_plane[next_src_idx],
 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		// 2 lines v
 		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
 		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
 		mfc_get( v_plane[next_src_idx],
 				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
 				src_dbl_linestride_vu+(next_lsoff_v<<1),
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		// 2 lines u
 		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
 		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
 		mfc_get( u_plane[next_src_idx],
 				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
 				src_dbl_linestride_vu+(next_lsoff_v<<1),
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 		// scaling
 		// work line y_upper
 		bilinear_scale_line_w16( y_plane[curr_src_idx],
 				scaled_y_plane[curr_src_idx],
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_upper,
 				src_linestride_y );
 		// work line y_lower
 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_lower,
 				src_linestride_y );
 		// work line v
 		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 				scaled_v_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );
 		// work line u
 		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 				scaled_u_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );


 		// Store the result back to main memory into a destination buffer in YUV format
 		//---------------------------------------------------------------------------------------------
 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 		// Perform three DMA transfers to 3 different locations in the main memory!
 		// dst_width:	Pixel width of destination image
 		// dst_addr:	Destination address in main memory
 		// dst_vu:	Counter which is incremented one by one
 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 		mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
 				(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 				dst_dbl_linestride_y,						// Two Y lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,						// Tag
 				0, 0 );

 		mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
 				(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,						// Two V lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,						// Tag
 				0, 0 );

 		mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
 				(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,						// Two U lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,						// Tag
 				0, 0 );
 		//---------------------------------------------------------------------------------------------


 		// update for next cycle
 		curr_src_idx = next_src_idx;
 		curr_dst_idx = next_dst_idx;

 		curr_interpl_y_upper = next_interpl_y_upper;
 		curr_interpl_y_lower = next_interpl_y_lower;
 		curr_interpl_vu = next_interpl_vu;

 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 		vf_curr_NSweight_vu = vf_next_NSweight_vu;

 		curr_src_y_upper = next_src_y_upper;
 		curr_src_y_lower = next_src_y_lower;
 		curr_src_vu = next_src_vu;

 		curr_lsoff_v = next_lsoff_v;
 		curr_lsoff_u = next_lsoff_u;
 	}


 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 	// scaling
 	// work line y_upper
 	bilinear_scale_line_w16( y_plane[curr_src_idx],
 			scaled_y_plane[curr_src_idx],
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_upper,
 			src_linestride_y );
 	// work line y_lower
 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_lower,
 			src_linestride_y );
 	// work line v
 	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 			scaled_v_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );
 	// work line u
 	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 			scaled_u_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );


 	// Store the result back to main memory into a destination buffer in YUV format
 	//---------------------------------------------------------------------------------------------
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 	// Perform three DMA transfers to 3 different locations in the main memory!
 	// dst_width:	Pixel width of destination image
 	// dst_addr:	Destination address in main memory
 	// dst_vu:	Counter which is incremented one by one
 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 	mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
 			(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 			dst_dbl_linestride_y,						// Two Y lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,						// Tag
 			0, 0 );

 	mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
 			(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 			dst_dbl_linestride_vu,						// Two V lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,						// Tag
 			0, 0 );

 	mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
 			(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 			dst_dbl_linestride_vu,						// Two U lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,						// Tag
 			0, 0 );

 	// wait for completion
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 	//---------------------------------------------------------------------------------------------
 }


 /*
  * scale_srcw16_dstw32()
  *
  * processes an input image of width 16
  * scaling is done to a width 32
  * yuv2rgb conversion on a width of 32
  * result stored in RAM
  */
 void scale_srcw16_dstw32() {
 	// extract parameters
 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

 	unsigned int src_width = parms.src_pixel_width;
 	unsigned int src_height = parms.src_pixel_height;
 	unsigned int dst_width = parms.dst_pixel_width;
 	unsigned int dst_height = parms.dst_pixel_height;

 	// YVU
 	unsigned int src_linestride_y = src_width;
 	unsigned int src_dbl_linestride_y = src_width<<1;
 	unsigned int src_linestride_vu = src_width>>1;
 	unsigned int src_dbl_linestride_vu = src_width;
 	// scaled YVU
 	unsigned int scaled_src_linestride_y = dst_width;

 	// ram addresses
 	unsigned char* src_addr_y = parms.y_plane;
 	unsigned char* src_addr_v = parms.v_plane;
 	unsigned char* src_addr_u = parms.u_plane;

 	unsigned int dst_picture_size = dst_width*dst_height;

 	// Sizes for destination
 	unsigned int dst_dbl_linestride_y = dst_width<<1;
 	unsigned int dst_dbl_linestride_vu = dst_width>>1;

 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
 	unsigned char* dst_addr_main_memory_y = dst_addr;
 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);


 	// for handling misalignment, addresses are precalculated
 	unsigned char* precalc_src_addr_v = src_addr_v;
 	unsigned char* precalc_src_addr_u = src_addr_u;

 	// calculate scale factors
 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 	float y_scale = (float)src_height/(float)dst_height;

 	// double buffered processing
 	// buffer switching
 	unsigned int curr_src_idx = 0;
 	unsigned int curr_dst_idx = 0;
 	unsigned int next_src_idx, next_dst_idx;

 	// 2 lines y as output, upper and lowerline
 	unsigned int curr_interpl_y_upper = 0;
 	unsigned int next_interpl_y_upper;
 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 	// only 1 line v/u output, both planes have the same dimension
 	unsigned int curr_interpl_vu = 0;
 	unsigned int next_interpl_vu;

 	// weights, calculated in every loop iteration
 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_y_upper;
 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_vu;

 	// line indices for the src picture
 	float curr_src_y_upper = 0.0f, next_src_y_upper;
 	float curr_src_y_lower, next_src_y_lower;
 	float curr_src_vu = 0.0f, next_src_vu;

 	// line indices for the dst picture
 	unsigned int dst_y=0, dst_vu=0;

 	// offset for the v and u plane to handle misalignement
 	unsigned int curr_lsoff_v = 0, next_lsoff_v;
 	unsigned int curr_lsoff_u = 0, next_lsoff_u;

 	// calculate lower line idices
 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 	// lower line weight
 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


 	// start partially double buffered processing
 	// get initial data, 2 sets of y, 1 set v, 1 set u
 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 			src_dbl_linestride_y,
 			RETR_BUF,
 			0, 0 );
 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

 	// iteration loop
 	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 	// the scaled output is 2 lines y, 1 line v, 1 line u
 	// the yuv2rgb-converted output is stored to RAM
 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 		dst_y = dst_vu<<1;

 		// calculate next indices
 		next_src_vu = ((float)dst_vu+1)*y_scale;
 		next_src_y_upper = ((float)dst_y+2)*y_scale;
 		next_src_y_lower = ((float)dst_y+3)*y_scale;

 		next_interpl_vu = (unsigned int) next_src_vu;
 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
 		next_interpl_y_lower = (unsigned int) next_src_y_lower;

 		// calculate weight NORTH-SOUTH
 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

 		// get next lines
 		next_src_idx = curr_src_idx^1;
 		next_dst_idx = curr_dst_idx^1;

 		// 4 lines y
 		mfc_get( y_plane[next_src_idx],
 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		// 2 lines v
 		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
 		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
 		mfc_get( v_plane[next_src_idx],
 				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
 				src_dbl_linestride_vu+(next_lsoff_v<<1),
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		// 2 lines u
 		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
 		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
 		mfc_get( u_plane[next_src_idx],
 				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
 				src_dbl_linestride_vu+(next_lsoff_v<<1),
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 		// scaling
 		// work line y_upper
 		bilinear_scale_line_w16( y_plane[curr_src_idx],
 				scaled_y_plane[curr_src_idx],
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_upper,
 				src_linestride_y );
 		// work line y_lower
 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_lower,
 				src_linestride_y );
 		// work line v
 		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 				scaled_v_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );
 		// work line u
 		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 				scaled_u_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );

 		//---------------------------------------------------------------------------------------------
 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 		// Perform three DMA transfers to 3 different locations in the main memory!
 		// dst_width:	Pixel width of destination image
 		// dst_addr:	Destination address in main memory
 		// dst_vu:	Counter which is incremented one by one
 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)

 		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );

 		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );

 		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );
 		//---------------------------------------------------------------------------------------------


 		// update for next cycle
 		curr_src_idx = next_src_idx;
 		curr_dst_idx = next_dst_idx;

 		curr_interpl_y_upper = next_interpl_y_upper;
 		curr_interpl_y_lower = next_interpl_y_lower;
 		curr_interpl_vu = next_interpl_vu;

 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 		vf_curr_NSweight_vu = vf_next_NSweight_vu;

 		curr_src_y_upper = next_src_y_upper;
 		curr_src_y_lower = next_src_y_lower;
 		curr_src_vu = next_src_vu;

 		curr_lsoff_v = next_lsoff_v;
 		curr_lsoff_u = next_lsoff_u;
 	}


 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 	// scaling
 	// work line y_upper
 	bilinear_scale_line_w16( y_plane[curr_src_idx],
 			scaled_y_plane[curr_src_idx],
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_upper,
 			src_linestride_y );
 	// work line y_lower
 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_lower,
 			src_linestride_y );
 	// work line v
 	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
 			scaled_v_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );
 	// work line u
 	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
 			scaled_u_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );

 	//---------------------------------------------------------------------------------------------
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 	// Perform three DMA transfers to 3 different locations in the main memory!
 	// dst_width:	Pixel width of destination image
 	// dst_addr:	Destination address in main memory
 	// dst_vu:	Counter which is incremented one by one
 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)

 	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	// wait for completion
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 	//---------------------------------------------------------------------------------------------
 }


 /*
  * scale_srcw32_dstw16()
  *
  * processes an input image of width 32
  * scaling is done to a width 16
  * yuv2rgb conversion on a width of 16
  * result stored in RAM
  */
 void scale_srcw32_dstw16() {
 	// extract parameters
 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

 	unsigned int src_width = parms.src_pixel_width;
 	unsigned int src_height = parms.src_pixel_height;
 	unsigned int dst_width = parms.dst_pixel_width;
 	unsigned int dst_height = parms.dst_pixel_height;

 	// YVU
 	unsigned int src_linestride_y = src_width;
 	unsigned int src_dbl_linestride_y = src_width<<1;
 	unsigned int src_linestride_vu = src_width>>1;
 	unsigned int src_dbl_linestride_vu = src_width;
 	// scaled YVU
 	unsigned int scaled_src_linestride_y = dst_width;

 	// ram addresses
 	unsigned char* src_addr_y = parms.y_plane;
 	unsigned char* src_addr_v = parms.v_plane;
 	unsigned char* src_addr_u = parms.u_plane;

 	unsigned int dst_picture_size = dst_width*dst_height;

 	// Sizes for destination
 	unsigned int dst_dbl_linestride_y = dst_width<<1;
 	unsigned int dst_dbl_linestride_vu = dst_width>>1;

 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
 	unsigned char* dst_addr_main_memory_y = dst_addr;
 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

 	// calculate scale factors
 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 	float y_scale = (float)src_height/(float)dst_height;

 	// double buffered processing
 	// buffer switching
 	unsigned int curr_src_idx = 0;
 	unsigned int curr_dst_idx = 0;
 	unsigned int next_src_idx, next_dst_idx;

 	// 2 lines y as output, upper and lowerline
 	unsigned int curr_interpl_y_upper = 0;
 	unsigned int next_interpl_y_upper;
 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 	// only 1 line v/u output, both planes have the same dimension
 	unsigned int curr_interpl_vu = 0;
 	unsigned int next_interpl_vu;

 	// weights, calculated in every loop iteration
 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_y_upper;
 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_vu;

 	// line indices for the src picture
 	float curr_src_y_upper = 0.0f, next_src_y_upper;
 	float curr_src_y_lower, next_src_y_lower;
 	float curr_src_vu = 0.0f, next_src_vu;

 	// line indices for the dst picture
 	unsigned int dst_y=0, dst_vu=0;

 	// calculate lower line idices
 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 	// lower line weight
 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


 	// start partially double buffered processing
 	// get initial data, 2 sets of y, 1 set v, 1 set u
 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 			src_dbl_linestride_y,
 			RETR_BUF,
 			0, 0 );
 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

 	// iteration loop
 	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 	// the scaled output is 2 lines y, 1 line v, 1 line u
 	// the yuv2rgb-converted output is stored to RAM
 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 		dst_y = dst_vu<<1;

 		// calculate next indices
 		next_src_vu = ((float)dst_vu+1)*y_scale;
 		next_src_y_upper = ((float)dst_y+2)*y_scale;
 		next_src_y_lower = ((float)dst_y+3)*y_scale;

 		next_interpl_vu = (unsigned int) next_src_vu;
 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
 		next_interpl_y_lower = (unsigned int) next_src_y_lower;

 		// calculate weight NORTH-SOUTH
 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

 		// get next lines
 		next_src_idx = curr_src_idx^1;
 		next_dst_idx = curr_dst_idx^1;

 		// 4 lines y
 		mfc_get( y_plane[next_src_idx],
 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		// 2 lines v
 		mfc_get( v_plane[next_src_idx],
 				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
 				src_dbl_linestride_vu,
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		// 2 lines u
 		mfc_get( u_plane[next_src_idx],
 				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
 				src_dbl_linestride_vu,
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 		// scaling
 		// work line y_upper
 		bilinear_scale_line_w16( y_plane[curr_src_idx],
 				scaled_y_plane[curr_src_idx],
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_upper,
 				src_linestride_y );
 		// work line y_lower
 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_lower,
 				src_linestride_y );
 		// work line v
 		bilinear_scale_line_w16( v_plane[curr_src_idx],
 				scaled_v_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );
 		// work line u
 		bilinear_scale_line_w16( u_plane[curr_src_idx],
 				scaled_u_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );

 		//---------------------------------------------------------------------------------------------
 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 		// Perform three DMA transfers to 3 different locations in the main memory!
 		// dst_width:	Pixel width of destination image
 		// dst_addr:	Destination address in main memory
 		// dst_vu:	Counter which is incremented one by one
 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)

 		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );

 		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );

 		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );
 		//---------------------------------------------------------------------------------------------


 		// update for next cycle
 		curr_src_idx = next_src_idx;
 		curr_dst_idx = next_dst_idx;

 		curr_interpl_y_upper = next_interpl_y_upper;
 		curr_interpl_y_lower = next_interpl_y_lower;
 		curr_interpl_vu = next_interpl_vu;

 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 		vf_curr_NSweight_vu = vf_next_NSweight_vu;

 		curr_src_y_upper = next_src_y_upper;
 		curr_src_y_lower = next_src_y_lower;
 		curr_src_vu = next_src_vu;
 	}


 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 	// scaling
 	// work line y_upper
 	bilinear_scale_line_w16( y_plane[curr_src_idx],
 			scaled_y_plane[curr_src_idx],
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_upper,
 			src_linestride_y );
 	// work line y_lower
 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_lower,
 			src_linestride_y );
 	// work line v
 	bilinear_scale_line_w16( v_plane[curr_src_idx],
 			scaled_v_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );
 	// work line u
 	bilinear_scale_line_w16( u_plane[curr_src_idx],
 			scaled_u_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );


 	//---------------------------------------------------------------------------------------------
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 	// Perform three DMA transfers to 3 different locations in the main memory!
 	// dst_width:	Pixel width of destination image
 	// dst_addr:	Destination address in main memory
 	// dst_vu:	Counter which is incremented one by one
 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)

 	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	// wait for completion
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 	//---------------------------------------------------------------------------------------------
 }


 /**
  * scale_srcw32_dstw32()
  *
  * processes an input image of width 32
  * scaling is done to a width 32
  * yuv2rgb conversion on a width of 32
  * result stored in RAM
  */
 void scale_srcw32_dstw32() {
 	// extract parameters
 	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

 	unsigned int src_width = parms.src_pixel_width;
 	unsigned int src_height = parms.src_pixel_height;
 	unsigned int dst_width = parms.dst_pixel_width;
 	unsigned int dst_height = parms.dst_pixel_height;

 	// YVU
 	unsigned int src_linestride_y = src_width;
 	unsigned int src_dbl_linestride_y = src_width<<1;
 	unsigned int src_linestride_vu = src_width>>1;
 	unsigned int src_dbl_linestride_vu = src_width;

 	// scaled YVU
 	unsigned int scaled_src_linestride_y = dst_width;

 	// ram addresses
 	unsigned char* src_addr_y = parms.y_plane;
 	unsigned char* src_addr_v = parms.v_plane;
 	unsigned char* src_addr_u = parms.u_plane;

 	unsigned int dst_picture_size = dst_width*dst_height;

 	// Sizes for destination
 	unsigned int dst_dbl_linestride_y = dst_width<<1;
 	unsigned int dst_dbl_linestride_vu = dst_width>>1;

 	// Perform address calculation for Y, V and U in main memory with dst_addr as base
 	unsigned char* dst_addr_main_memory_y = dst_addr;
 	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

 	// calculate scale factors
 	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 	float y_scale = (float)src_height/(float)dst_height;

 	// double buffered processing
 	// buffer switching
 	unsigned int curr_src_idx = 0;
 	unsigned int curr_dst_idx = 0;
 	unsigned int next_src_idx, next_dst_idx;

 	// 2 lines y as output, upper and lowerline
 	unsigned int curr_interpl_y_upper = 0;
 	unsigned int next_interpl_y_upper;
 	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 	// only 1 line v/u output, both planes have the same dimension
 	unsigned int curr_interpl_vu = 0;
 	unsigned int next_interpl_vu;

 	// weights, calculated in every loop iteration
 	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_y_upper;
 	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 	vector float vf_next_NSweight_vu;

 	// line indices for the src picture
 	float curr_src_y_upper = 0.0f, next_src_y_upper;
 	float curr_src_y_lower, next_src_y_lower;
 	float curr_src_vu = 0.0f, next_src_vu;

 	// line indices for the dst picture
 	unsigned int dst_y=0, dst_vu=0;

 	// calculate lower line idices
 	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 	// lower line weight
 	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


 	// start partially double buffered processing
 	// get initial data, 2 sets of y, 1 set v, 1 set u
 	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 			src_dbl_linestride_y,
 			RETR_BUF,
 			0, 0 );
 	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

 	// iteration loop
 	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 	// the scaled output is 2 lines y, 1 line v, 1 line u
 	// the yuv2rgb-converted output is stored to RAM
 	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 		dst_y = dst_vu<<1;

 		// calculate next indices
 		next_src_vu = ((float)dst_vu+1)*y_scale;
 		next_src_y_upper = ((float)dst_y+2)*y_scale;
 		next_src_y_lower = ((float)dst_y+3)*y_scale;

 		next_interpl_vu = (unsigned int) next_src_vu;
 		next_interpl_y_upper = (unsigned int) next_src_y_upper;
 		next_interpl_y_lower = (unsigned int) next_src_y_lower;

 		// calculate weight NORTH-SOUTH
 		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

 		// get next lines
 		next_src_idx = curr_src_idx^1;
 		next_dst_idx = curr_dst_idx^1;

 		// 4 lines y
 		mfc_get( y_plane[next_src_idx],
 				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 				src_dbl_linestride_y,
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		// 2 lines v
 		mfc_get( v_plane[next_src_idx],
 				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
 				src_dbl_linestride_vu,
 				RETR_BUF+next_src_idx,
 				0, 0 );
 		// 2 lines u
 		mfc_get( u_plane[next_src_idx],
 				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
 				src_dbl_linestride_vu,
 				RETR_BUF+next_src_idx,
 				0, 0 );

 		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 		// scaling
 		// work line y_upper
 		bilinear_scale_line_w16( y_plane[curr_src_idx],
 				scaled_y_plane[curr_src_idx],
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_upper,
 				src_linestride_y );
 		// work line y_lower
 		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 				dst_width,
 				vf_x_scale,
 				vf_curr_NSweight_y_lower,
 				src_linestride_y );
 		// work line v
 		bilinear_scale_line_w16( v_plane[curr_src_idx],
 				scaled_v_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );
 		// work line u
 		bilinear_scale_line_w16( u_plane[curr_src_idx],
 				scaled_u_plane[curr_src_idx],
 				dst_width>>1,
 				vf_x_scale,
 				vf_curr_NSweight_vu,
 				src_linestride_vu );


 		// Store the result back to main memory into a destination buffer in YUV format
 		//---------------------------------------------------------------------------------------------
 		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 		// Perform three DMA transfers to 3 different locations in the main memory!
 		// dst_width:	Pixel width of destination image
 		// dst_addr:	Destination address in main memory
 		// dst_vu:	Counter which is incremented one by one
 		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)

 		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );

 		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );

 		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 				STR_BUF+curr_dst_idx,								// Tag
 				0, 0 );
 		//---------------------------------------------------------------------------------------------


 		// update for next cycle
 		curr_src_idx = next_src_idx;
 		curr_dst_idx = next_dst_idx;

 		curr_interpl_y_upper = next_interpl_y_upper;
 		curr_interpl_y_lower = next_interpl_y_lower;
 		curr_interpl_vu = next_interpl_vu;

 		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 		vf_curr_NSweight_vu = vf_next_NSweight_vu;

 		curr_src_y_upper = next_src_y_upper;
 		curr_src_y_lower = next_src_y_lower;
 		curr_src_vu = next_src_vu;
 	}


 	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

 	// scaling
 	// work line y_upper
 	bilinear_scale_line_w16( y_plane[curr_src_idx],
 			scaled_y_plane[curr_src_idx],
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_upper,
 			src_linestride_y );
 	// work line y_lower
 	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 			dst_width,
 			vf_x_scale,
 			vf_curr_NSweight_y_lower,
 			src_linestride_y );
 	// work line v
 	bilinear_scale_line_w16( v_plane[curr_src_idx],
 			scaled_v_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );
 	// work line u
 	bilinear_scale_line_w16( u_plane[curr_src_idx],
 			scaled_u_plane[curr_src_idx],
 			dst_width>>1,
 			vf_x_scale,
 			vf_curr_NSweight_vu,
 			src_linestride_vu );


 	// Store the result back to main memory into a destination buffer in YUV format
 	//---------------------------------------------------------------------------------------------
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

 	// Perform three DMA transfers to 3 different locations in the main memory!
 	// dst_width:	Pixel width of destination image
 	// dst_addr:	Destination address in main memory
 	// dst_vu:	Counter which is incremented one by one
 	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)

 	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 			STR_BUF+curr_dst_idx,								// Tag
 			0, 0 );

 	// wait for completion
 	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 	//---------------------------------------------------------------------------------------------
 }


 /*
  * bilinear_scale_line_w8()
  *
  * processes a line of yuv-input, width has to be a multiple of 8
  * scaled yuv-output is written to local store buffer
  *
  * @param src buffer for 2 lines input
  * @param dst_ buffer for 1 line output
  * @param dst_width the width of the destination line
  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
  * @param src_linestride the stride of the srcline
  */
 void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {

 	unsigned char* dst = dst_;

 	unsigned int dst_x;
 	for( dst_x=0; dst_x<dst_width; dst_x+=8) {
 		// address calculation for loading the 4 surrounding pixel of each calculated
 		// destination pixel
 		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
 		// lower range->first 4 pixel
 		// upper range->next 4 pixel
 		vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
 		vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
 		vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
 		vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );

 		// calculate weight EAST-WEST
 		vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
 		vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
 		vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
 		vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
 		vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
 		vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
 		vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
 		vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
 		vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
 		vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );

 		// calculate address offset
 		//
 		// pixel NORTH WEST
 		vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
 		vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;

 		// pixel NORTH EAST-->(offpixelNW+1)
 		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
 		vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
 		vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );

 		// SOUTH-WEST-->(offpixelNW+src_linestride)
 		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
 		vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
 		vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );

 		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
 		vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
 		vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );

 		// calculate each address
 		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
 		vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
 		vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
 		vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
 		vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );

 		vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
 		vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
 		vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
 		vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );

 		// get each pixel
 		//
 		// scalar load, afterwards insertion into the right position
 		// NORTH WEST
 		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 		vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
 		vuc_pixel_NW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
 				vuc_pixel_NW_lower_range, 7 );
 		vuc_pixel_NW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
 				vuc_pixel_NW_lower_range, 11 );
 		vuc_pixel_NW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
 				vuc_pixel_NW_lower_range, 15 );

 		vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
 		vuc_pixel_NW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
 				vuc_pixel_NW_upper_range, 7 );
 		vuc_pixel_NW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
 				vuc_pixel_NW_upper_range, 11 );
 		vuc_pixel_NW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
 				vuc_pixel_NW_upper_range, 15 );

 		// NORTH EAST
 		vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
 		vuc_pixel_NE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
 				vuc_pixel_NE_lower_range, 7 );
 		vuc_pixel_NE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
 				vuc_pixel_NE_lower_range, 11 );
 		vuc_pixel_NE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
 				vuc_pixel_NE_lower_range, 15 );

 		vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
 		vuc_pixel_NE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
 				vuc_pixel_NE_upper_range, 7 );
 		vuc_pixel_NE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
 				vuc_pixel_NE_upper_range, 11 );
 		vuc_pixel_NE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
 				vuc_pixel_NE_upper_range, 15 );


 		// SOUTH WEST
 		vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
 		vuc_pixel_SW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
 				vuc_pixel_SW_lower_range, 7 );
 		vuc_pixel_SW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
 				vuc_pixel_SW_lower_range, 11 );
 		vuc_pixel_SW_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
 				vuc_pixel_SW_lower_range, 15 );

 		vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
 		vuc_pixel_SW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
 				vuc_pixel_SW_upper_range, 7 );
 		vuc_pixel_SW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
 				vuc_pixel_SW_upper_range, 11 );
 		vuc_pixel_SW_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
 				vuc_pixel_SW_upper_range, 15 );

 		// SOUTH EAST
 		vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
 		vuc_pixel_SE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
 				vuc_pixel_SE_lower_range, 7 );
 		vuc_pixel_SE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
 				vuc_pixel_SE_lower_range, 11 );
 		vuc_pixel_SE_lower_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
 				vuc_pixel_SE_lower_range, 15 );

 		vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
 		vuc_pixel_SE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
 				vuc_pixel_SE_upper_range, 7 );
 		vuc_pixel_SE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
 				vuc_pixel_SE_upper_range, 11 );
 		vuc_pixel_SE_upper_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
 				vuc_pixel_SE_upper_range, 15 );


 		// convert to float
 		vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
 		vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );

 		vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
 		vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );

 		vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
 		vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );

 		vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
 		vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );


 		// first linear interpolation: EWtop
 		// EWtop = NW + EWweight*(NE-NW)
 		//
 		// lower range
 		vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
 		vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
 								vf_EWtop_lower_range_tmp,
 								vf_pixel_NW_lower_range );

 		// upper range
 		vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
 		vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
 								vf_EWtop_upper_range_tmp,
 								vf_pixel_NW_upper_range );


 		// second linear interpolation: EWbottom
 		// EWbottom = SW + EWweight*(SE-SW)
 		//
 		// lower range
 		vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
 		vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
 								vf_EWbottom_lower_range_tmp,
 								vf_pixel_SW_lower_range );

 		// upper range
 		vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
 		vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
 								vf_EWbottom_upper_range_tmp,
 								vf_pixel_SW_upper_range );


 		// third linear interpolation: the bilinear interpolated value
 		// result = EWtop + NSweight*(EWbottom-EWtop);
 		//
 		// lower range
 		vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
 		vector float vf_result_lower_range = spu_madd( vf_NSweight,
 								vf_result_lower_range_tmp,
 								vf_EWtop_lower_range );

 		// upper range
 		vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
 		vector float vf_result_upper_range = spu_madd( vf_NSweight,
 								vf_result_upper_range_tmp,
 								vf_EWtop_upper_range );


 		// convert back: using saturated arithmetic
 		vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
 		vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );

 		// merge results->lower,upper
 		vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
 							       0x13, 0x17, 0x1B, 0x1F,
 							       0x00, 0x00, 0x00, 0x00,
 							       0x00, 0x00, 0x00, 0x00 };

 		vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
 								(vector unsigned char) vui_result_upper_range,
 								vuc_mask_merge_result );

 		// partial storing
 		vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
 						      0x00, 0x00, 0x00, 0x00,
 						      0xFF, 0xFF, 0xFF, 0xFF,
 						      0xFF, 0xFF, 0xFF, 0xFF };


 		// get currently stored data
 		vector unsigned char vuc_orig = *((vector unsigned char*)dst);

 		// clear currently stored data
 		vuc_orig = spu_and( vuc_orig,
 				spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );

 		// rotate result according to storing address
 		vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );

 		// store result
 		*((vector unsigned char*)dst) = spu_or( vuc_result,
 							vuc_orig );
 		dst += 8;
 	}
 }


 /*
  * bilinear_scale_line_w16()
  *
  * processes a line of yuv-input, width has to be a multiple of 16
  * scaled yuv-output is written to local store buffer
  *
  * @param src buffer for 2 lines input
  * @param dst_ buffer for 1 line output
  * @param dst_width the width of the destination line
  * @param vf_x_scale a float vector, at each entry is the x_scale-factor
  * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
  * @param src_linestride the stride of the srcline
  */
 void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {

 	unsigned char* dst = dst_;

 	unsigned int dst_x;
 	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
 		// address calculation for loading the 4 surrounding pixel of each calculated
 		// destination pixel
 		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
 		// parallelised processing
 		// first range->pixel 1 2 3 4
 		// second range->pixel 5 6 7 8
 		// third range->pixel 9 10 11 12
 		// fourth range->pixel 13 14 15 16
 		vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
 		vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
 		vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
 		vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
 		vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
 		vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
 		vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
 		vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );

 		// calculate weight EAST-WEST
 		vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
 		vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
 		vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
 		vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
 		vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
 		vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
 		vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
 		vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
 		vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
 		vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
 		vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
 		vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
 		vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
 		vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
 		vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
 		vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
 		vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
 		vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
 		vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
 		vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );

 		// calculate address offset
 		//
 		// pixel NORTH WEST
 		vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
 		vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
 		vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
 		vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;

 		// pixel NORTH EAST-->(offpixelNW+1)
 		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
 		vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
 		vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
 		vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
 		vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );

 		// SOUTH-WEST-->(offpixelNW+src_linestride)
 		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
 		vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
 		vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
 		vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
 		vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );

 		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
 		vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
 		vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
 		vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
 		vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );

 		// calculate each address
 		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
 		vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
 		vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
 		vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
 		vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );

 		vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
 		vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
 		vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
 		vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );

 		vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
 		vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
 		vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
 		vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );

 		vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
 		vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
 		vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
 		vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );


 		// get each pixel
 		//
 		// scalar load, afterwards insertion into the right position
 		// NORTH WEST
 		// first range
 		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 		vector unsigned char vuc_pixel_NW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
 		vuc_pixel_NW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
 				vuc_pixel_NW_first_range, 7 );
 		vuc_pixel_NW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
 				vuc_pixel_NW_first_range, 11 );
 		vuc_pixel_NW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
 				vuc_pixel_NW_first_range, 15 );
 		// second range
 		vector unsigned char vuc_pixel_NW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
 		vuc_pixel_NW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
 				vuc_pixel_NW_second_range, 7 );
 		vuc_pixel_NW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
 				vuc_pixel_NW_second_range, 11 );
 		vuc_pixel_NW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
 				vuc_pixel_NW_second_range, 15 );
 		// third range
 		vector unsigned char vuc_pixel_NW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
 		vuc_pixel_NW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
 				vuc_pixel_NW_third_range, 7 );
 		vuc_pixel_NW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
 				vuc_pixel_NW_third_range, 11 );
 		vuc_pixel_NW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
 				vuc_pixel_NW_third_range, 15 );
 		// fourth range
 		vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
 		vuc_pixel_NW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
 				vuc_pixel_NW_fourth_range, 7 );
 		vuc_pixel_NW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
 				vuc_pixel_NW_fourth_range, 11 );
 		vuc_pixel_NW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
 				vuc_pixel_NW_fourth_range, 15 );

 		// NORTH EAST
 		// first range
 		vector unsigned char vuc_pixel_NE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
 		vuc_pixel_NE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
 				vuc_pixel_NE_first_range, 7 );
 		vuc_pixel_NE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
 				vuc_pixel_NE_first_range, 11 );
 		vuc_pixel_NE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
 				vuc_pixel_NE_first_range, 15 );
 		// second range
 		vector unsigned char vuc_pixel_NE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
 		vuc_pixel_NE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
 				vuc_pixel_NE_second_range, 7 );
 		vuc_pixel_NE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
 				vuc_pixel_NE_second_range, 11 );
 		vuc_pixel_NE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
 				vuc_pixel_NE_second_range, 15 );
 		// third range
 		vector unsigned char vuc_pixel_NE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
 		vuc_pixel_NE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
 				vuc_pixel_NE_third_range, 7 );
 		vuc_pixel_NE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
 				vuc_pixel_NE_third_range, 11 );
 		vuc_pixel_NE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
 				vuc_pixel_NE_third_range, 15 );
 		// fourth range
 		vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
 		vuc_pixel_NE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
 				vuc_pixel_NE_fourth_range, 7 );
 		vuc_pixel_NE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
 				vuc_pixel_NE_fourth_range, 11 );
 		vuc_pixel_NE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
 				vuc_pixel_NE_fourth_range, 15 );

 		// SOUTH WEST
 		// first range
 		vector unsigned char vuc_pixel_SW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
 		vuc_pixel_SW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
 				vuc_pixel_SW_first_range, 7 );
 		vuc_pixel_SW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
 				vuc_pixel_SW_first_range, 11 );
 		vuc_pixel_SW_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
 				vuc_pixel_SW_first_range, 15 );
 		// second range
 		vector unsigned char vuc_pixel_SW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
 		vuc_pixel_SW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
 				vuc_pixel_SW_second_range, 7 );
 		vuc_pixel_SW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
 				vuc_pixel_SW_second_range, 11 );
 		vuc_pixel_SW_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
 				vuc_pixel_SW_second_range, 15 );
 		// third range
 		vector unsigned char vuc_pixel_SW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
 		vuc_pixel_SW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
 				vuc_pixel_SW_third_range, 7 );
 		vuc_pixel_SW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
 				vuc_pixel_SW_third_range, 11 );
 		vuc_pixel_SW_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
 				vuc_pixel_SW_third_range, 15 );
 		// fourth range
 		vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
 		vuc_pixel_SW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
 				vuc_pixel_SW_fourth_range, 7 );
 		vuc_pixel_SW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
 				vuc_pixel_SW_fourth_range, 11 );
 		vuc_pixel_SW_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
 				vuc_pixel_SW_fourth_range, 15 );

 		// NORTH EAST
 		// first range
 		vector unsigned char vuc_pixel_SE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
 		vuc_pixel_SE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
 				vuc_pixel_SE_first_range, 7 );
 		vuc_pixel_SE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
 				vuc_pixel_SE_first_range, 11 );
 		vuc_pixel_SE_first_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
 				vuc_pixel_SE_first_range, 15 );
 		// second range
 		vector unsigned char vuc_pixel_SE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
 		vuc_pixel_SE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
 				vuc_pixel_SE_second_range, 7 );
 		vuc_pixel_SE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
 				vuc_pixel_SE_second_range, 11 );
 		vuc_pixel_SE_second_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
 				vuc_pixel_SE_second_range, 15 );
 		// third range
 		vector unsigned char vuc_pixel_SE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
 		vuc_pixel_SE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
 				vuc_pixel_SE_third_range, 7 );
 		vuc_pixel_SE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
 				vuc_pixel_SE_third_range, 11 );
 		vuc_pixel_SE_third_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
 				vuc_pixel_SE_third_range, 15 );
 		// fourth range
 		vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
 		vuc_pixel_SE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
 				vuc_pixel_SE_fourth_range, 7 );
 		vuc_pixel_SE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
 				vuc_pixel_SE_fourth_range, 11 );
 		vuc_pixel_SE_fourth_range = spu_insert(
 				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
 				vuc_pixel_SE_fourth_range, 15 );


 		// convert to float
 		vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
 		vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
 		vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
 		vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );

 		vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
 		vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
 		vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
 		vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );

 		vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
 		vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
 		vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
 		vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );

 		vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
 		vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
 		vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
 		vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );

 		// first linear interpolation: EWtop
 		// EWtop = NW + EWweight*(NE-NW)
 		//
 		// first range
 		vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
 		vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
 								vf_EWtop_first_range_tmp,
 								vf_pixel_NW_first_range );

 		// second range
 		vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
 		vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
 								vf_EWtop_second_range_tmp,
 								vf_pixel_NW_second_range );

 		// third range
 		vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
 		vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
 								vf_EWtop_third_range_tmp,
 								vf_pixel_NW_third_range );

 		// fourth range
 		vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
 		vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
 								vf_EWtop_fourth_range_tmp,
 								vf_pixel_NW_fourth_range );


 		// second linear interpolation: EWbottom
 		// EWbottom = SW + EWweight*(SE-SW)
 		//
 		// first range
 		vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
 		vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
 								vf_EWbottom_first_range_tmp,
 								vf_pixel_SW_first_range );

 		// second range
 		vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
 		vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
 								vf_EWbottom_second_range_tmp,
 								vf_pixel_SW_second_range );
 		// first range
 		vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
 		vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
 								vf_EWbottom_third_range_tmp,
 								vf_pixel_SW_third_range );

 		// first range
 		vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
 		vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
 								vf_EWbottom_fourth_range_tmp,
 								vf_pixel_SW_fourth_range );


 		// third linear interpolation: the bilinear interpolated value
 		// result = EWtop + NSweight*(EWbottom-EWtop);
 		//
 		// first range
 		vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
 		vector float vf_result_first_range = spu_madd( vf_NSweight,
 								vf_result_first_range_tmp,
 								vf_EWtop_first_range );

 		// second range
 		vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
 		vector float vf_result_second_range = spu_madd( vf_NSweight,
 								vf_result_second_range_tmp,
 								vf_EWtop_second_range );

 		// third range
 		vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
 		vector float vf_result_third_range = spu_madd( vf_NSweight,
 								vf_result_third_range_tmp,
 								vf_EWtop_third_range );

 		// fourth range
 		vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
 		vector float vf_result_fourth_range = spu_madd( vf_NSweight,
 								vf_result_fourth_range_tmp,
 								vf_EWtop_fourth_range );


 		// convert back: using saturated arithmetic
 		vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
 		vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
 		vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
 		vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );

 		// merge results->lower,upper
 		vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
 							       		    0x13, 0x17, 0x1B, 0x1F,
 							       		    0x00, 0x00, 0x00, 0x00,
 							       		    0x00, 0x00, 0x00, 0x00 };

 		vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
 							       		    0x00, 0x00, 0x00, 0x00,
 									    0x03, 0x07, 0x0B, 0x0F,
 							       		    0x13, 0x17, 0x1B, 0x1F };

 		vector unsigned char vuc_result_first_second =
 						spu_shuffle( (vector unsigned char) vui_result_first_range,
 								 (vector unsigned char) vui_result_second_range,
 								vuc_mask_merge_result_first_second );

 		vector unsigned char vuc_result_third_fourth =
 						spu_shuffle( (vector unsigned char) vui_result_third_range,
 								 (vector unsigned char) vui_result_fourth_range,
 								vuc_mask_merge_result_third_fourth );

 		// store result
 		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
 							vuc_result_third_fourth );
 		dst += 16;
 	}
 }