// Fills one horizontal span with a textured, alpha-blended colour using SSE2,
// processing four pixels per iteration plus a tail of up to three pixels.
//
// Parameters (as used by the visible code):
//   y        - not referenced in the lines shown here; presumably selects the
//              destination row -- confirm against the full source.
//   x1, x2   - span extent; only x2 is visibly used (to derive xe).
//   w1, w2   - values interpolated across the span; u and v are multiplied by
//              the reciprocal of the interpolated w before texture lookup.
//   u1, u2, v1, v2 - texture coordinates at the two ends of the span.
//
// NOTE(review): this listing appears to omit some original lines. d, id, xb, p,
// mu_4, mv_4, ap, apl and aph are used but not declared in the visible text,
// and no closing braces are shown. Comments below describe only what the
// visible statements do.
33 inline void LeRasterizer::fillFlatTexAlphaZC(
int y,
float x1,
float x2,
float w1,
float w2,
float u1,
float u2,
float v1,
float v2)
// Nothing to do when d is zero (d is computed on a line not shown; id below is
// presumably its reciprocal -- confirm).
36 if (d == 0.0f)
return;
// Increments of u, v and w: the end-to-end delta scaled by id.
39 float au = (u2 - u1) *
id;
40 float av = (v2 - v1) *
id;
41 float aw = (w2 - w1) *
id;
// Lanes 0..3 hold the start value plus 0, 1, 2 and 3 increments
// (_mm_set_ps takes its arguments from the highest lane to the lowest).
43 __m128 u_4 = _mm_set_ps(u1 + 3.0f * au, u1 + 2.0f * au, u1 + au, u1);
44 __m128 v_4 = _mm_set_ps(v1 + 3.0f * av, v1 + 2.0f * av, v1 + av, v1);
45 __m128 w_4 = _mm_set_ps(w1 + 3.0f * aw, w1 + 2.0f * aw, w1 + aw, w1);
// Broadcast step of four increments, applied once per loop iteration.
47 __m128 au_4 = _mm_set1_ps(au * 4.0f);
48 __m128 av_4 = _mm_set1_ps(av * 4.0f);
49 __m128 aw_4 = _mm_set1_ps(aw * 4.0f);
// End column derived from x2 (xb, the start column, is set on a line not shown).
52 int xe = (int)(x2 + 1.9999f);
// b = number of complete groups of four pixels, r = leftover pixels (0..3).
56 int b = (xe - xb) >> 2;
57 int r = (xe - xb) & 0x3;
// Every 16-bit word is 0x0100 (256); used below to form (256 - a).
59 __m128i sc = _mm_set1_epi32(0x01000100);
// Main loop: one iteration per group of four pixels.
60 for (
int x = 0; x < b; x ++) {
// Approximate reciprocal of w (_mm_rcp_ps is a reduced-precision estimate).
61 __m128 z_4 = _mm_rcp_ps(w_4);
// u * (1/w) and v * (1/w); v is additionally scaled by texScale_4.
64 mu_4 = _mm_mul_ps(u_4, z_4);
65 mv_4 = _mm_mul_ps(v_4, z_4);
66 mv_4 = _mm_mul_ps(mv_4, texScale_4);
// Convert to 32-bit integers, mask each coordinate, then add them to obtain
// one index into texDiffusePixels per lane.
69 mui_4 = _mm_cvtps_epi32(mu_4);
70 mvi_4 = _mm_cvtps_epi32(mv_4);
71 mui_4 = _mm_and_si128(mui_4, texMaskU_4);
72 mvi_4 = _mm_and_si128(mvi_4, texMaskV_4);
73 mui_4 = _mm_add_epi32(mui_4, mvi_4);
// All-zero vector used to widen bytes to 16-bit words.
75 __m128i zv = _mm_set1_epi32(0);
76 __m128i tp, tq, fp, t1, t2;
// --- First pair (lanes 0 and 1) ---
// fp = 64 bits read from p; tp/tq = 64 bits read at each texel index, of which
// only the low 32 bits survive the unpacklo_epi32 below.
// NOTE(review): each texel load reads 8 bytes although 4 are used -- confirm
// texDiffusePixels has at least one element of slack after the last texel.
// NOTE(review): lanes are read by casting &mui_4 to uint32_t*.
79 fp = _mm_loadl_epi64((__m128i *) p);
80 tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[0]]);
81 tq = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[1]]);
82 t1 = _mm_unpacklo_epi32(tp, tq);
// Zero-extend the low 8 bytes of each to eight 16-bit words (two pixels).
83 fp = _mm_unpacklo_epi8(fp, zv);
84 t1 = _mm_unpacklo_epi8(t1, zv);
// Broadcast word 3 over the low half and word 7 over the high half, i.e. the
// top byte of each texel (presumably the alpha channel -- confirm layout),
// then merge: low 64 bits from apl, high 64 bits from aph.
86 apl = _mm_shufflelo_epi16(t1, 0xFF);
87 aph = _mm_shufflehi_epi16(t1, 0xFF);
88 ap = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(aph), _mm_castsi128_pd(apl)));
// ap = 256 - a, per word.
89 ap = _mm_sub_epi16(sc, ap);
// result = (texel * color_4 + dest * (256 - a)) >> 8, per 16-bit channel;
// the sum uses unsigned saturating addition.
91 t1 = _mm_mullo_epi16(t1, color_4);
92 fp = _mm_mullo_epi16(fp, ap);
93 t1 = _mm_adds_epu16(t1, fp);
94 t1 = _mm_srli_epi16(t1, 8);
// --- Second pair (lanes 2 and 3), same sequence reading from p+2 ---
96 fp = _mm_loadl_epi64((__m128i *) (p+2));
97 tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[2]]);
98 tq = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[3]]);
99 t2 = _mm_unpacklo_epi32(tp, tq);
100 fp = _mm_unpacklo_epi8(fp, zv);
101 t2 = _mm_unpacklo_epi8(t2, zv);
103 apl = _mm_shufflelo_epi16(t2, 0xFF);
104 aph = _mm_shufflehi_epi16(t2, 0xFF);
105 ap = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(aph), _mm_castsi128_pd(apl)));
106 ap = _mm_sub_epi16(sc, ap);
108 t2 = _mm_mullo_epi16(t2, color_4);
109 fp = _mm_mullo_epi16(fp, ap);
110 t2 = _mm_adds_epu16(t2, fp);
111 t2 = _mm_srli_epi16(t2, 8);
// Narrow both pairs back to bytes (unsigned saturation) and write all 16 bytes
// to p with an unaligned store.
112 tp = _mm_packus_epi16(t1, t2);
113 _mm_storeu_si128((__m128i *) p, tp);
// Advance the interpolants by four increments.
116 w_4 = _mm_add_ps(w_4, aw_4);
117 u_4 = _mm_add_ps(u_4, au_4);
118 v_4 = _mm_add_ps(v_4, av_4);
// --- Tail: up to three single pixels ---
// NOTE(review): the end of the loop above and any checks on r are not visible
// in this listing; presumably each single-pixel step below is guarded by r.
// The coordinate computation repeats the one in the main loop.
123 __m128 z_4 = _mm_rcp_ps(w_4);
126 mu_4 = _mm_mul_ps(u_4, z_4);
127 mv_4 = _mm_mul_ps(v_4, z_4);
128 mv_4 = _mm_mul_ps(mv_4, texScale_4);
130 __m128i mui_4, mvi_4;
131 mui_4 = _mm_cvtps_epi32(mu_4);
132 mvi_4 = _mm_cvtps_epi32(mv_4);
133 mui_4 = _mm_and_si128( mui_4, texMaskU_4);
134 mvi_4 = _mm_and_si128( mvi_4, texMaskV_4);
135 mui_4 = _mm_add_epi32(mui_4, mvi_4);
137 __m128i zv = _mm_set1_epi32(0);
// Tail pixel using lane 0: same blend as above on the low four words only.
// NOTE(review): _mm_loadl_epi64 reads 8 bytes from p although only the low
// 4 bytes reach the stored result -- confirm reading past the span is safe.
139 fp = _mm_loadl_epi64((__m128i *) p);
140 tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[0]]);
141 fp = _mm_unpacklo_epi8(fp, zv);
142 tp = _mm_unpacklo_epi8(tp, zv);
// Broadcast word 3 of the texel over the low four words, then form 256 - a.
145 ap = _mm_shufflelo_epi16(tp, 0xFF);
146 ap = _mm_sub_epi16(sc, ap);
148 tp = _mm_mullo_epi16(tp, color_4);
149 fp = _mm_mullo_epi16(fp, ap);
150 tp = _mm_adds_epu16(tp, fp);
151 tp = _mm_srli_epi16(tp, 8);
// Pack to bytes, write the low 32 bits through p and advance p by one element.
153 tp = _mm_packus_epi16(tp, zv);
154 *p++ = _mm_cvtsi128_si32(tp);
// Tail pixel using lane 1.
157 fp = _mm_loadl_epi64((__m128i *) p);
158 tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[1]]);
159 fp = _mm_unpacklo_epi8(fp, zv);
160 tp = _mm_unpacklo_epi8(tp, zv);
162 ap = _mm_shufflelo_epi16(tp, 0xFF);
163 ap = _mm_sub_epi16(sc, ap);
165 tp = _mm_mullo_epi16(tp, color_4);
166 fp = _mm_mullo_epi16(fp, ap);
167 tp = _mm_adds_epu16(tp, fp);
168 tp = _mm_srli_epi16(tp, 8);
170 tp = _mm_packus_epi16(tp, zv);
171 *p++ = _mm_cvtsi128_si32(tp);
// Tail pixel using lane 2.
174 fp = _mm_loadl_epi64((__m128i *) p);
175 tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[2]]);
176 fp = _mm_unpacklo_epi8(fp, zv);
177 tp = _mm_unpacklo_epi8(tp, zv);
179 ap = _mm_shufflelo_epi16(tp, 0xFF);
180 ap = _mm_sub_epi16(sc, ap);
182 tp = _mm_mullo_epi16(tp, color_4);
183 fp = _mm_mullo_epi16(fp, ap);
184 tp = _mm_adds_epu16(tp, fp);
185 tp = _mm_srli_epi16(tp, 8);
187 tp = _mm_packus_epi16(tp, zv);
188 *p++ = _mm_cvtsi128_si32(tp);
int tx
Definition: bitmap.h:81
LeBitmap frame
Definition: rasterizer_float.h:61
Represent an RGBA color.
Definition: color.h:42