1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
| ; Copyright (c) Microsoft Corporation. All rights reserved.
|
|
| ; Custom Build Step, including a listing file placed in intermediate directory
| ; but without Source Browser information
| ; debug:
| ; ml -c -Zi "-Fl$(IntDir)\$(InputName).lst" "-Fo$(IntDir)\$(InputName).obj" "$(InputPath)"
| ; release:
| ; ml -c "-Fl$(IntDir)\$(InputName).lst" "-Fo$(IntDir)\$(InputName).obj" "$(InputPath)"
| ; outputs:
| ; $(IntDir)\$(InputName).obj
|
| ; Custom Build Step, including a listing file placed in intermediate directory
| ; and Source Browser information also placed in intermediate directory
| ; debug:
| ; ml -c -Zi "-Fl$(IntDir)\$(InputName).lst" "-FR$(IntDir)\$(InputName).sbr" "-Fo$(IntDir)\$(InputName).obj" "$(InputPath)"
| ; release:
| ; ml -c "-Fl$(IntDir)\$(InputName).lst" "-FR$(IntDir)\$(InputName).sbr" "-Fo$(IntDir)\$(InputName).obj" "$(InputPath)"
| ; outputs:
| ; $(IntDir)\$(InputName).obj
| ; $(IntDir)\$(InputName).sbr
|
| ; The 32-bit processor/model directives below are left disabled. The
| ; procedures in this file use 64-bit registers (rcx, r8, r14, ...), so
| ; presumably the file is meant for the 64-bit assembler -- confirm against
| ; the project's build settings.
| ;.386
|
| ;.MODEL FLAT, C
|
| ; Pointer type aliases. None of them is referenced anywhere in this file.
| PBYTE TYPEDEF PTR BYTE
| PWORD TYPEDEF PTR WORD
| PDWORD TYPEDEF PTR DWORD
|
| ; No static data is defined; the data segment is intentionally empty.
| .data
|
|
| ; Everything below is code.
| .code
|
| ; Euclid's algorithm finds the greatest common divisor by repeatedly
| ; subtracting the smaller number from the larger number until zero
| ; is reached. The number remaining is the greatest common divisor.
|
| ;----------------------------------------------------------------------
| ; Byte_To_Short
| ;
| ; Widens nByte unsigned 8-bit values to unsigned 16-bit values
| ; (zero extension): dst[i] = (WORD)src[i] for i = 0 .. nByte-1.
| ;
| ; ABI:   Microsoft x64, leaf procedure (no stack frame, no calls)
| ; In:    rcx = source address      (nByte bytes are read)
| ;        rdx = destination address (nByte words are written)
| ;        r8  = element count, treated as a signed 64-bit value;
| ;              zero or negative counts convert nothing
| ; Out:   no defined return value (rax is scratch)
| ; Clobb: rax, rcx, rdx, r9, xmm0-xmm2, flags -- all volatile on Win64
| ;
| ; Precondition: when nByte >= 16 the destination must be 16-byte aligned,
| ; because the bulk loop stores with MOVNTDQ, which faults on an unaligned
| ; address. The source may have any alignment (loaded with MOVDQU).
| ;
| ; Fixes relative to the previous revision:
| ;   * addresses are kept in full 64-bit registers (were ecx/edx, which
| ;     truncated pointers to 32 bits);
| ;   * the loop counter lives in volatile r9 instead of callee-saved rsi;
| ;   * counts below 16 now convert every element (previously exactly one);
| ;   * counts that are a multiple of 16, and a count of 0, no longer
| ;     convert one element too many (out-of-bounds read and write);
| ;   * SFENCE orders the non-temporal stores before the procedure returns.
| ;----------------------------------------------------------------------
|
| ByteAddr equ rcx                ; running source pointer
| ShortAddr equ rdx               ; running destination pointer
| nByte equ r8                    ; total number of elements (never modified)
|
| iByte equ r9                    ; loop counter: blocks, then tail bytes left
|
| Byte_To_Short PROC
|
|     test nByte, nByte
|     jle DONE                    ; nothing to do for count <= 0
|
|     pxor xmm0, xmm0             ; all-zero source for the unpack high bytes
|
|     mov iByte, nByte
|     shr iByte, 4                ; iByte = number of full 16-byte blocks
|     jz START_EXTRA              ; fewer than 16 elements: tail only
|
| LOOP_SIMD:
|
|     movdqu xmm1, xmmword ptr [ByteAddr]     ; 16 source bytes
|     movdqa xmm2, xmm1
|
|     punpcklbw xmm1, xmm0        ; bytes 0..7  -> 8 zero-extended words
|     punpckhbw xmm2, xmm0        ; bytes 8..15 -> 8 zero-extended words
|
|     movntdq xmmword ptr [ShortAddr], xmm1       ; non-temporal, needs alignment
|     movntdq xmmword ptr [ShortAddr+16], xmm2
|
|     add ByteAddr, 16
|     add ShortAddr, 32
|
|     dec iByte
|     jnz LOOP_SIMD
|
|     sfence                      ; make the streaming stores globally ordered
|
| START_EXTRA:
|     mov iByte, nByte
|     and iByte, 15               ; iByte = elements left after the blocks
|     jz DONE
|
| LOOP_EXTRA:
|
|     movzx eax, byte ptr [ByteAddr]          ; zero-extend one byte
|     mov word ptr [ShortAddr], ax
|
|     inc ByteAddr
|     add ShortAddr, 2
|
|     dec iByte
|     jnz LOOP_EXTRA
|
| DONE:
|     RET
| Byte_To_Short ENDP
|
|
|
|
| ;==========================================================
|
| ;----------------------------------------------------------------------
| ; CheckPitchAsm
| ;
| ; Accumulates a weighted difference between each pixel pair of an 8-bit
| ; image region and the pixels one "pitch" further along, and returns the
| ; total as a 64-bit sum.
| ;
| ; For every processed byte address a, with P = iPitch, using 16-bit
| ; wrap-around arithmetic:
| ;     d = SPR*(p[a] + p[a+1])
| ;         - fSPR*p[a+P] - SPR*p[a+P+1] - (SPR - fSPR)*p[a+P+2]
| ;     contribution = max(|d| - 1, 0)
| ;
| ; ABI: Microsoft x64. Register arguments:
| ;     rcx = iPitch   byte distance to the comparison pixels
| ;     rdx = SPR      weight, only the low 16 bits are used
| ;     r8  = fSPR     weight, only the low 16 bits are used
| ;     r9  = sSPR     never read; the third weight is SPR - fSPR
| ; Stack arguments, each read as a full 64-bit value through rbp. This
| ; procedure never sets rbp itself: it relies on the frame
| ; (push rbp / mov rbp, rsp) that the assembler emits for a PROC that
| ; declares LOCALs, which puts the 5th argument at [rbp+30h]:
| ;     InspLeft, InspRight, InspTop, InspBottom, pData, BuffWidth
| ; Out:   rax = sum of all contributions
| ; Clobb: rax, rcx, rdx, r9, r10, r11, xmm0, xmm4, xmm5, flags.
| ;        xmm6-xmm14 are used but saved and restored.
| ; Needs: SSSE3 (PABSW).
| ;
| ; Behaviour kept from the previous revision (NOTE(review): confirm that
| ; callers size their buffers accordingly):
| ;   * each row is processed in groups of 16 pixels, so the width
| ;     InspRight - InspLeft is effectively rounded up to a multiple of 16,
| ;     and loads reach up to iPitch + 2 + 15 bytes past the last group;
| ;   * at least one group of at least one row is always processed, even if
| ;     the rectangle is empty;
| ;   * rows are InspTop .. InspBottom-1, row stride is BuffWidth bytes.
| ;
| ; Fixes relative to the previous revision:
| ;   * callee-saved rsi, rdi and r15 are no longer touched (volatile
| ;     rcx / r9 are used instead);
| ;   * callee-saved xmm6-xmm14 are saved on entry and restored on exit;
| ;   * the result is read straight from the XMM register instead of going
| ;     through a stack buffer addressed with 32-bit esi, which truncated
| ;     the stack address;
| ;   * the line-end pointer comparison is unsigned.
| ;
| ; NOTE(review): no unwind (SEH) data is declared for this frame, as in
| ; the previous revision.
| ;----------------------------------------------------------------------
|
| iPitch equ rcx
| SPR equ rdx
| fSPR equ r8
| sSPR equ r9                     ; declared for completeness, never read
|
| InspLeft equ qword ptr[rbp+ 30h]
| InspRight equ qword ptr[rbp+ 38h]
| InspTop equ qword ptr[rbp+ 40h]
| InspBottom equ qword ptr[rbp+ 48h]
|
| pData equ qword ptr[rbp+ 50h]
| BuffWidth equ qword ptr[rbp+ 58h]
|
| iy equ r9                       ; current row (r9 is free: sSPR is unused)
|
|
| xxSrc equ xmm4                  ; difference accumulator, pixels 0..7
| xxSrc2 equ xmm5                 ; difference accumulator, pixels 8..15
| xxCmp equ xmm6                  ; comparison pixels / scratch, low half
| xxCmp2 equ xmm7                 ; comparison pixels / scratch, high half
| xxSpr equ xmm8                  ; SPR broadcast to 8 words
| xxOne equ xmm9                  ; constant 1 broadcast to 8 words
| xxfSpr equ xmm10                ; fSPR broadcast to 8 words
| xxsSpr equ xmm11                ; SPR - fSPR broadcast to 8 words
| xxMinus equ xmm12               ; pixels at +1, low half
| xxMinus2 equ xmm13              ; pixels at +1, high half
| xxRslt equ xmm14                ; two 64-bit partial sums
|
| rrImgAddr equ rcx               ; current pixel address (rcx is free once
|                                 ; iPitch has been copied to rrPitch)
| rrPitch equ r10                 ; iPitch
| rrPitch2 equ r11                ; iPitch + 1
|
|
|
|
| CheckPitchAsm PROC
| local ImgAddrLine : qword       ; address of the first pixel of this row
| local ImgLineEnd : qword        ; ImgAddrLine + ImgLineLength
| local ImgLineLength : qword     ; InspRight - InspLeft
| local SavedXmm[18] : qword      ; save area for xmm6..xmm14 (9 * 16 bytes)
|
|     ; Preserve the non-volatile XMM registers used below. MOVDQU is used
|     ; so that no assumption is made about the save area's alignment.
|     movdqu xmmword ptr SavedXmm[0], xmm6
|     movdqu xmmword ptr SavedXmm[16], xmm7
|     movdqu xmmword ptr SavedXmm[32], xmm8
|     movdqu xmmword ptr SavedXmm[48], xmm9
|     movdqu xmmword ptr SavedXmm[64], xmm10
|     movdqu xmmword ptr SavedXmm[80], xmm11
|     movdqu xmmword ptr SavedXmm[96], xmm12
|     movdqu xmmword ptr SavedXmm[112], xmm13
|     movdqu xmmword ptr SavedXmm[128], xmm14
|
|     mov rrPitch, iPitch         ; must come first: rcx is reused below
|     lea rrPitch2, [rrPitch + 1]
|
|     ; Broadcast the 16-bit constants: duplicate word 0 into dword 0, then
|     ; replicate dword 0 across the register.
|     mov eax, 1
|     movd xxOne, eax             ; fill with 1 (words)
|     punpcklwd xxOne, xxOne
|     pshufd xxOne, xxOne, 0
|
|     movd xxSpr, SPR
|     punpcklwd xxSpr, xxSpr
|     pshufd xxSpr, xxSpr, 0
|
|     movd xxfSpr, fSPR
|     punpcklwd xxfSpr, xxfSpr
|     pshufd xxfSpr, xxfSpr, 0
|
|     movdqa xxsSpr, xxSpr        ; third weight = SPR - fSPR
|     psubw xxsSpr, xxfSpr
|
|     ; Build the image address:
|     ;   rrImgAddr = pData + BuffWidth*InspTop + InspLeft
|     mov rax, BuffWidth
|     imul rax, InspTop           ; low 64 bits only; leaves rdx untouched
|     add rax, InspLeft
|     add rax, pData
|     mov rrImgAddr, rax
|     mov ImgAddrLine, rax
|
|     mov rax, InspRight
|     sub rax, InspLeft
|     mov ImgLineLength, rax
|
|     add rax, rrImgAddr
|     mov ImgLineEnd, rax
|
|     mov iy, InspTop
|     pxor xxRslt, xxRslt         ; running sum = 0
|     pxor xmm0, xmm0             ; xmm0 stays zero for all unpacks below
|
| Cmp_16:
|     ; 1. Source pixels: (p[a] + p[a+1]) * SPR
|     movdqu xxSrc, xmmword ptr [rrImgAddr]
|     movdqa xxSrc2, xxSrc
|     punpcklbw xxSrc, xmm0       ; bytes -> words
|     punpckhbw xxSrc2, xmm0
|
|     movdqu xxMinus, xmmword ptr [rrImgAddr + 1]
|     movdqa xxMinus2, xxMinus
|     punpcklbw xxMinus, xmm0
|     punpckhbw xxMinus2, xmm0
|
|     paddw xxSrc, xxMinus
|     paddw xxSrc2, xxMinus2
|     pmullw xxSrc, xxSpr
|     pmullw xxSrc2, xxSpr
|
|     ; 2. Comparison pixels: subtract p[a+P] * fSPR
|     movdqu xxCmp, xmmword ptr [rrImgAddr + rrPitch]
|     movdqa xxCmp2, xxCmp
|     punpcklbw xxCmp, xmm0
|     punpckhbw xxCmp2, xmm0
|     pmullw xxCmp, xxfSpr
|     pmullw xxCmp2, xxfSpr
|
|     psubw xxSrc, xxCmp
|     psubw xxSrc2, xxCmp2
|
|     ; subtract p[a+P+1] * SPR
|     movdqu xxCmp, xmmword ptr [rrImgAddr + rrPitch2]
|     movdqa xxCmp2, xxCmp
|     punpcklbw xxCmp, xmm0
|     punpckhbw xxCmp2, xmm0
|     pmullw xxCmp, xxSpr
|     pmullw xxCmp2, xxSpr
|
|     psubw xxSrc, xxCmp
|     psubw xxSrc2, xxCmp2
|
|     ; subtract p[a+P+2] * (SPR - fSPR)
|     movdqu xxCmp, xmmword ptr [rrImgAddr + rrPitch2 + 1]
|     movdqa xxCmp2, xxCmp
|     punpcklbw xxCmp, xmm0
|     punpckhbw xxCmp2, xmm0
|     pmullw xxCmp, xxsSpr
|     pmullw xxCmp2, xxsSpr
|
|     psubw xxSrc, xxCmp
|     psubw xxSrc2, xxCmp2
|
|     ; 3. |d| - 1 with unsigned saturation, then add adjacent word pairs
|     ;    into dwords by multiplying with 1 (PMADDWD).
|     pabsw xxSrc, xxSrc          ; absolute value
|     psubusw xxSrc, xxOne
|     pmaddwd xxSrc, xxOne        ; 16 -> 32 bit pair sums
|     pabsw xxSrc2, xxSrc2        ; absolute value
|     psubusw xxSrc2, xxOne
|     pmaddwd xxSrc2, xxOne       ; 16 -> 32 bit pair sums
|
|     ; 4. Zero-extend the dword sums to qwords and accumulate.
|     movdqa xxCmp, xxSrc
|     punpckldq xxCmp, xmm0       ; low two dwords  -> qwords
|     punpckhdq xxSrc, xmm0       ; high two dwords -> qwords
|
|     paddq xxRslt, xxSrc
|     paddq xxRslt, xxCmp
|
|     movdqa xxCmp, xxSrc2
|     punpckldq xxCmp, xmm0
|     punpckhdq xxSrc2, xmm0
|
|     paddq xxRslt, xxSrc2
|     paddq xxRslt, xxCmp
|
|     ; Next group of 16 pixels in this row.
|     add rrImgAddr, 10h
|     cmp rrImgAddr, ImgLineEnd
|     jb Cmp_16                   ; addresses compare unsigned
|
|     ; Next row, or finish after row InspBottom-1.
|     inc iy
|     cmp iy, InspBottom
|     jge End_Buff
|
|     mov rax, ImgAddrLine
|     add rax, BuffWidth
|     mov ImgAddrLine, rax
|     mov rrImgAddr, rax
|
|     add rax, ImgLineLength
|     mov ImgLineEnd, rax
|
|     jmp Cmp_16
|
|
| End_Buff:
|
|     ; Return the sum of the two 64-bit partial sums.
|     movd rax, xxRslt            ; low qword
|     psrldq xxRslt, 8
|     movd rdx, xxRslt            ; high qword
|     add rax, rdx
|
|     ; Restore the non-volatile XMM registers.
|     movdqu xmm6, xmmword ptr SavedXmm[0]
|     movdqu xmm7, xmmword ptr SavedXmm[16]
|     movdqu xmm8, xmmword ptr SavedXmm[32]
|     movdqu xmm9, xmmword ptr SavedXmm[48]
|     movdqu xmm10, xmmword ptr SavedXmm[64]
|     movdqu xmm11, xmmword ptr SavedXmm[80]
|     movdqu xmm12, xmmword ptr SavedXmm[96]
|     movdqu xmm13, xmmword ptr SavedXmm[112]
|     movdqu xmm14, xmmword ptr SavedXmm[128]
|
|     RET
| CheckPitchAsm ENDP
| end
|
|