le3d - LightEngine 3D
A straightforward C++ 3D software engine for real-time graphics
flattexalphazc.h
Go to the documentation of this file.
1 
33 inline void LeRasterizer::fillFlatTexAlphaZC(int y, float x1, float x2, float w1, float w2, float u1, float u2, float v1, float v2)
34 {
35  float d = x2 - x1;
36  if (d == 0.0f) return;
37 
38  float id = 1.0f / d;
39  float au = (u2 - u1) * id;
40  float av = (v2 - v1) * id;
41  float aw = (w2 - w1) * id;
42 
43  __m128 u_4 = _mm_set_ps(u1 + 3.0f * au, u1 + 2.0f * au, u1 + au, u1);
44  __m128 v_4 = _mm_set_ps(v1 + 3.0f * av, v1 + 2.0f * av, v1 + av, v1);
45  __m128 w_4 = _mm_set_ps(w1 + 3.0f * aw, w1 + 2.0f * aw, w1 + aw, w1);
46 
47  __m128 au_4 = _mm_set1_ps(au * 4.0f);
48  __m128 av_4 = _mm_set1_ps(av * 4.0f);
49  __m128 aw_4 = _mm_set1_ps(aw * 4.0f);
50 
51  int xb = (int)(x1);
52  int xe = (int)(x2 + 1.9999f);
53  if (xe > frame.tx) xe = frame.tx;
54 
55  LeColor * p = xb + ((int) y) * frame.tx + pixels;
56  int b = (xe - xb) >> 2;
57  int r = (xe - xb) & 0x3;
58 
59  __m128i sc = _mm_set1_epi32(0x01000100);
60  for (int x = 0; x < b; x ++) {
61  __m128 z_4 = _mm_rcp_ps(w_4);
62 
63  __m128 mu_4, mv_4;
64  mu_4 = _mm_mul_ps(u_4, z_4);
65  mv_4 = _mm_mul_ps(v_4, z_4);
66  mv_4 = _mm_mul_ps(mv_4, texScale_4);
67 
68  __m128i mui_4, mvi_4;
69  mui_4 = _mm_cvtps_epi32(mu_4);
70  mvi_4 = _mm_cvtps_epi32(mv_4);
71  mui_4 = _mm_and_si128(mui_4, texMaskU_4);
72  mvi_4 = _mm_and_si128(mvi_4, texMaskV_4);
73  mui_4 = _mm_add_epi32(mui_4, mvi_4);
74 
75  __m128i zv = _mm_set1_epi32(0);
76  __m128i tp, tq, fp, t1, t2;
77  __m128i ap, apl, aph;
78 
79  fp = _mm_loadl_epi64((__m128i *) p);
80  tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[0]]);
81  tq = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[1]]);
82  t1 = _mm_unpacklo_epi32(tp, tq);
83  fp = _mm_unpacklo_epi8(fp, zv);
84  t1 = _mm_unpacklo_epi8(t1, zv);
85 
86  apl = _mm_shufflelo_epi16(t1, 0xFF);
87  aph = _mm_shufflehi_epi16(t1, 0xFF);
88  ap = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(aph), _mm_castsi128_pd(apl)));
89  ap = _mm_sub_epi16(sc, ap);
90 
91  t1 = _mm_mullo_epi16(t1, color_4);
92  fp = _mm_mullo_epi16(fp, ap);
93  t1 = _mm_adds_epu16(t1, fp);
94  t1 = _mm_srli_epi16(t1, 8);
95 
96  fp = _mm_loadl_epi64((__m128i *) (p+2));
97  tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[2]]);
98  tq = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[3]]);
99  t2 = _mm_unpacklo_epi32(tp, tq);
100  fp = _mm_unpacklo_epi8(fp, zv);
101  t2 = _mm_unpacklo_epi8(t2, zv);
102 
103  apl = _mm_shufflelo_epi16(t2, 0xFF);
104  aph = _mm_shufflehi_epi16(t2, 0xFF);
105  ap = _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(aph), _mm_castsi128_pd(apl)));
106  ap = _mm_sub_epi16(sc, ap);
107 
108  t2 = _mm_mullo_epi16(t2, color_4);
109  fp = _mm_mullo_epi16(fp, ap);
110  t2 = _mm_adds_epu16(t2, fp);
111  t2 = _mm_srli_epi16(t2, 8);
112  tp = _mm_packus_epi16(t1, t2);
113  _mm_storeu_si128((__m128i *) p, tp);
114  p += 4;
115 
116  w_4 = _mm_add_ps(w_4, aw_4);
117  u_4 = _mm_add_ps(u_4, au_4);
118  v_4 = _mm_add_ps(v_4, av_4);
119  }
120 
121  if (r == 0) return;
122 
123  __m128 z_4 = _mm_rcp_ps(w_4);
124 
125  __m128 mu_4, mv_4;
126  mu_4 = _mm_mul_ps(u_4, z_4);
127  mv_4 = _mm_mul_ps(v_4, z_4);
128  mv_4 = _mm_mul_ps(mv_4, texScale_4);
129 
130  __m128i mui_4, mvi_4;
131  mui_4 = _mm_cvtps_epi32(mu_4);
132  mvi_4 = _mm_cvtps_epi32(mv_4);
133  mui_4 = _mm_and_si128( mui_4, texMaskU_4);
134  mvi_4 = _mm_and_si128( mvi_4, texMaskV_4);
135  mui_4 = _mm_add_epi32(mui_4, mvi_4);
136 
137  __m128i zv = _mm_set1_epi32(0);
138  __m128i tp, fp;
139  fp = _mm_loadl_epi64((__m128i *) p);
140  tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[0]]);
141  fp = _mm_unpacklo_epi8(fp, zv);
142  tp = _mm_unpacklo_epi8(tp, zv);
143 
144  __m128i ap;
145  ap = _mm_shufflelo_epi16(tp, 0xFF);
146  ap = _mm_sub_epi16(sc, ap);
147 
148  tp = _mm_mullo_epi16(tp, color_4);
149  fp = _mm_mullo_epi16(fp, ap);
150  tp = _mm_adds_epu16(tp, fp);
151  tp = _mm_srli_epi16(tp, 8);
152 
153  tp = _mm_packus_epi16(tp, zv);
154  *p++ = _mm_cvtsi128_si32(tp);
155 
156  if (r == 1) return;
157  fp = _mm_loadl_epi64((__m128i *) p);
158  tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[1]]);
159  fp = _mm_unpacklo_epi8(fp, zv);
160  tp = _mm_unpacklo_epi8(tp, zv);
161 
162  ap = _mm_shufflelo_epi16(tp, 0xFF);
163  ap = _mm_sub_epi16(sc, ap);
164 
165  tp = _mm_mullo_epi16(tp, color_4);
166  fp = _mm_mullo_epi16(fp, ap);
167  tp = _mm_adds_epu16(tp, fp);
168  tp = _mm_srli_epi16(tp, 8);
169 
170  tp = _mm_packus_epi16(tp, zv);
171  *p++ = _mm_cvtsi128_si32(tp);
172 
173  if (r == 2) return;
174  fp = _mm_loadl_epi64((__m128i *) p);
175  tp = _mm_loadl_epi64((__m128i *) &texDiffusePixels[((uint32_t *)&mui_4)[2]]);
176  fp = _mm_unpacklo_epi8(fp, zv);
177  tp = _mm_unpacklo_epi8(tp, zv);
178 
179  ap = _mm_shufflelo_epi16(tp, 0xFF);
180  ap = _mm_sub_epi16(sc, ap);
181 
182  tp = _mm_mullo_epi16(tp, color_4);
183  fp = _mm_mullo_epi16(fp, ap);
184  tp = _mm_adds_epu16(tp, fp);
185  tp = _mm_srli_epi16(tp, 8);
186 
187  tp = _mm_packus_epi16(tp, zv);
188  *p++ = _mm_cvtsi128_si32(tp);
189 }
int tx
Definition: bitmap.h:81
LeBitmap frame
Definition: rasterizer_float.h:61
Represent an RGBA color.
Definition: color.h:42