libmad-0.15.1b-optimization.patch.avr32 83 KB

  1. diff --git a/bit.c b/bit.c
  2. index c2bfb24..262ce3a 100644
  3. --- a/bit.c
  4. +++ b/bit.c
  5. @@ -25,12 +25,6 @@
  6. # include "global.h"
  7. -# ifdef HAVE_LIMITS_H
  8. -# include <limits.h>
  9. -# else
  10. -# define CHAR_BIT 8
  11. -# endif
  12. -
  13. # include "bit.h"
  14. /*
  15. @@ -81,6 +75,8 @@ unsigned short const crc_table[256] = {
  16. # define CRC_POLY 0x8005
  17. +#ifndef FPM_AVR32
  18. +
  19. /*
  20. * NAME: bit->init()
  21. * DESCRIPTION: initialize bit pointer struct
  22. @@ -190,6 +186,8 @@ void mad_bit_write(struct mad_bitptr *bitptr, unsigned int len,
  23. }
  24. # endif
  25. +#endif
  26. +
  27. /*
  28. * NAME: bit->crc()
  29. * DESCRIPTION: compute CRC-check word
  30. diff --git a/bit.h b/bit.h
  31. index 5a51570..70f550a 100644
  32. --- a/bit.h
  33. +++ b/bit.h
  34. @@ -22,6 +22,92 @@
  35. # ifndef LIBMAD_BIT_H
  36. # define LIBMAD_BIT_H
  37. +# ifdef HAVE_LIMITS_H
  38. +# include <limits.h>
  39. +# else
  40. +# define CHAR_BIT 8
  41. +# endif
  42. +
  43. +#ifdef FPM_AVR32
  44. +
  45. +struct mad_bitptr {
  46. + unsigned char const *byte;
  47. + unsigned int read_bytes;
  48. +};
  49. +
  50. +/*
  51. + * NAME: bit->init()
  52. + * DESCRIPTION: initialize bit pointer struct
  53. + */
  54. +static void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte)
  55. +{
  56. + bitptr->byte = byte;
  57. + bitptr->read_bytes = 0;
  58. +}
  59. +
  60. +/*
  61. + * NAME: bit->length()
  62. + * DESCRIPTION: return number of bits between start and end points
  63. + */
  64. +static unsigned int mad_bit_length(struct mad_bitptr const *begin,
  65. + struct mad_bitptr const *end)
  66. +{
  67. + return (end->read_bytes - begin->read_bytes) +
  68. + 8 * (end->byte - begin->byte);
  69. +}
  70. +
  71. +/*
  72. + * NAME: bit->nextbyte()
  73. + * DESCRIPTION: return pointer to next unprocessed byte
  74. + */
  75. +static unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr)
  76. +{
  77. + return bitptr->byte + ((bitptr->read_bytes + 0x7) >> 3);
  78. +}
  79. +
  80. +/*
  81. + * NAME: bit->skip()
  82. + * DESCRIPTION: advance bit pointer
  83. + */
  84. +static void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len)
  85. +{
  86. + bitptr->read_bytes += len;
  87. + bitptr->byte += (bitptr->read_bytes >> 3);
  88. + bitptr->read_bytes &= 0x7;
  89. +}
  90. +
  91. +/*
  92. + * NAME: bit->read()
  93. + * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value
  94. + */
  95. +static unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
  96. +{
  97. + register unsigned long value;
  98. +
  99. + if (!len)
  100. + return 0;
  101. +
  102. + value = *(unsigned int *)bitptr->byte;
  103. +
  104. + value <<= bitptr->read_bytes;
  105. + value >>= (32 - len);
  106. +
  107. + bitptr->read_bytes += len;
  108. + bitptr->byte += (bitptr->read_bytes >> 3);
  109. + bitptr->read_bytes &= 0x7;
  110. +
  111. + return value;
  112. +}
  113. +
  114. +# define mad_bit_finish(bitptr) /* nothing */
  115. +
  116. +static unsigned long mad_bit_bitsleft(struct mad_bitptr *bitptr)
  117. +{
  118. + return (8 - (bitptr)->read_bytes);
  119. +}
  120. +
  121. +#else /* #ifdef FPM_AVR32 */
  122. +
  123. struct mad_bitptr {
  124. unsigned char const *byte;
  125. unsigned short cache;
  126. @@ -42,6 +128,8 @@ void mad_bit_skip(struct mad_bitptr *, unsigned int);
  127. unsigned long mad_bit_read(struct mad_bitptr *, unsigned int);
  128. void mad_bit_write(struct mad_bitptr *, unsigned int, unsigned long);
  129. +#endif
  130. +
  131. unsigned short mad_bit_crc(struct mad_bitptr, unsigned int, unsigned short);
  132. # endif
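A minimal usage sketch of the AVR32 fast path above (plain C, illustrative only; the header bytes and field widths are invented). The reader keeps a byte pointer plus a 0-7 bit offset (the read_bytes field, despite its name, counts bits), and mad_bit_read loads a whole big-endian word, shifts the already-consumed bits off the top, then right-aligns the requested field. Because whole words are loaded, the buffer needs slack bytes past the data, which libmad's MAD_BUFFER_GUARD normally provides:

    /* assumes bit.h is included with FPM_AVR32 defined, big-endian target */
    #include <stdio.h>

    static unsigned char const buf[8] = {
      0xff, 0xfb, 0x90, 0x64, 0, 0, 0, 0   /* hypothetical MPEG header + guard */
    };

    int main(void)
    {
      struct mad_bitptr start, ptr;

      mad_bit_init(&start, buf);
      ptr = start;

      printf("sync  0x%03lx\n", mad_bit_read(&ptr, 11));       /* 0x7ff */
      mad_bit_skip(&ptr, 2);                                   /* version bits */
      printf("layer 0x%lx\n",  mad_bit_read(&ptr, 2));         /* 0x1 here */
      printf("used  %u bits\n", mad_bit_length(&start, &ptr)); /* 15 */
      return 0;
    }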
  133. diff --git a/configure.ac b/configure.ac
  134. index 9b79399..063cb9b 100644
  135. --- a/configure.ac
  136. +++ b/configure.ac
  137. @@ -274,13 +274,14 @@ fi
  138. AC_MSG_CHECKING(for architecture-specific fixed-point math routines)
  139. AC_ARG_ENABLE(fpm, AC_HELP_STRING([--enable-fpm=ARCH],
  140. [use ARCH-specific fixed-point math routines
  141. - (one of: intel, arm, mips, sparc, ppc, 64bit, default)]),
  142. + (one of: intel, arm, avr32, mips, sparc, ppc, 64bit, default)]),
  143. [
  144. case "$enableval" in
  145. yes) ;;
  146. no|default|approx) FPM="DEFAULT" ;;
  147. intel|i?86) FPM="INTEL" ;;
  148. arm) FPM="ARM" ;;
  149. + avr32) FPM="AVR32" ;;
  150. mips) FPM="MIPS" ;;
  151. sparc) FPM="SPARC" ;;
  152. ppc|powerpc) FPM="PPC" ;;
  153. @@ -298,6 +299,7 @@ then
  154. case "$host" in
  155. i?86-*) FPM="INTEL" ;;
  156. arm*-*) FPM="ARM" ;;
  157. + avr32*-*) FPM="AVR32" ;;
  158. mips*-*) FPM="MIPS" ;;
  159. sparc*-*) FPM="SPARC" ;;
  160. powerpc*-*) FPM="PPC" ;;
  161. @@ -343,6 +345,11 @@ then
  162. ASO="$ASO -DASO_IMDCT"
  163. ASO_OBJS="imdct_l_arm.lo"
  164. ;;
  165. + avr32*-*)
  166. + ASO="$ASO -DASO_INTERLEAVE2"
  167. + ASO="$ASO -DASO_ZEROCHECK"
  168. + ASO_OBJS="dct32_avr32.lo synth_avr32.lo imdct_avr32.lo"
  169. + ;;
  170. mips*-*)
  171. ASO="$ASO -DASO_INTERLEAVE2"
  172. ASO="$ASO -DASO_ZEROCHECK"
  173. diff --git a/configure b/configure
  174. index ee421cc..7a9f0c8 100755
  175. --- a/configure
  176. +++ b/configure
  177. @@ -1048,7 +1048,7 @@ Optional Features:
  178. --enable-speed optimize for speed over accuracy
  179. --enable-accuracy optimize for accuracy over speed
  180. --enable-fpm=ARCH use ARCH-specific fixed-point math routines (one of:
  181. - intel, arm, mips, sparc, ppc, 64bit, default)
  182. + intel, arm, avr32, mips, sparc, ppc, 64bit, default)
  183. --enable-sso use subband synthesis optimization
  184. --disable-aso disable architecture-specific optimizations
  185. --enable-strict-iso use strict ISO/IEC interpretations
  186. @@ -21477,6 +21477,7 @@ if test "${enable_fpm+set}" = set; then
  187. no|default|approx) FPM="DEFAULT" ;;
  188. intel|i?86) FPM="INTEL" ;;
  189. arm) FPM="ARM" ;;
  190. + avr32) FPM="AVR32" ;;
  191. mips) FPM="MIPS" ;;
  192. sparc) FPM="SPARC" ;;
  193. ppc|powerpc) FPM="PPC" ;;
  194. @@ -21498,6 +21499,7 @@ then
  195. case "$host" in
  196. i?86-*) FPM="INTEL" ;;
  197. arm*-*) FPM="ARM" ;;
  198. + avr32*-*) FPM="AVR32" ;;
  199. mips*-*) FPM="MIPS" ;;
  200. sparc*-*) FPM="SPARC" ;;
  201. powerpc*-*) FPM="PPC" ;;
  202. @@ -21554,6 +21556,11 @@ then
  203. ASO="$ASO -DASO_IMDCT"
  204. ASO_OBJS="imdct_l_arm.lo"
  205. ;;
  206. + avr32*-*)
  207. + ASO="$ASO -DASO_INTERLEAVE2"
  208. + ASO="$ASO -DASO_ZEROCHECK"
  209. + ASO_OBJS="dct32_avr32.lo synth_avr32.lo imdct_avr32.lo"
  210. + ;;
  211. mips*-*)
  212. ASO="$ASO -DASO_INTERLEAVE2"
  213. ASO="$ASO -DASO_ZEROCHECK"
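With both configure.ac and the generated configure patched, the AVR32 path is selected either explicitly or by host autodetection; an illustrative cross-build invocation (the exact triplet depends on the toolchain) would be ./configure --host=avr32-linux --enable-fpm=avr32, after which the ASO block above adds -DASO_INTERLEAVE2 -DASO_ZEROCHECK and links dct32_avr32.lo, synth_avr32.lo and imdct_avr32.lo in place of the generic C versions.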
  214. diff --git a/dct32_avr32.S b/dct32_avr32.S
  215. new file mode 100644
  216. index 0000000..7513340
  217. --- /dev/null
  218. +++ b/dct32_avr32.S
  219. @@ -0,0 +1,780 @@
  220. +/*
  221. + Optimized 32-point Discrete Cosine Transform (DCT)
  222. + Copyright 2003-2006 Atmel Corporation.
  223. +
  224. + Written by Ronny Pedersen, Atmel Norway
  225. +
  226. + This program is free software; you can redistribute it and/or modify
  227. + it under the terms of the GNU General Public License as published by
  228. + the Free Software Foundation; either version 2 of the License, or
  229. + (at your option) any later version.
  230. +
  231. + This program is distributed in the hope that it will be useful,
  232. + but WITHOUT ANY WARRANTY; without even the implied warranty of
  233. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  234. + GNU General Public License for more details.
  235. +
  236. + You should have received a copy of the GNU General Public License
  237. + along with this program; if not, write to the Free Software
  238. + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
  239. +
  240. +#define SHIFT 12
  241. +#define MAD_F_SCALEBITS 28
  242. +#define SLOTS 8
  243. +
  244. +#define MAD_F(x) ((x + (1 << 15)) >> 16)
  245. +
  246. +# define costab1 MAD_F(0x7fd8878e)
  247. +# define costab2 MAD_F(0x7f62368f)
  248. +# define costab3 MAD_F(0x7e9d55fc)
  249. +# define costab4 MAD_F(0x7d8a5f40)
  250. +# define costab5 MAD_F(0x7c29fbee)
  251. +# define costab6 MAD_F(0x7a7d055b)
  252. +# define costab7 MAD_F(0x78848414)
  253. +# define costab8 MAD_F(0x7641af3d)
  254. +# define costab9 MAD_F(0x73b5ebd1)
  255. +# define costab10 MAD_F(0x70e2cbc6)
  256. +# define costab11 MAD_F(0x6dca0d14)
  257. +# define costab12 MAD_F(0x6a6d98a4)
  258. +# define costab13 MAD_F(0x66cf8120)
  259. +# define costab14 MAD_F(0x62f201ac)
  260. +# define costab15 MAD_F(0x5ed77c8a)
  261. +# define costab16 MAD_F(0x5a82799a)
  262. +# define costab17 MAD_F(0x55f5a4d2)
  263. +# define costab18 MAD_F(0x5133cc94)
  264. +# define costab19 MAD_F(0x4c3fdff4)
  265. +# define costab20 MAD_F(0x471cece7)
  266. +# define costab21 MAD_F(0x41ce1e65)
  267. +# define costab22 MAD_F(0x3c56ba70)
  268. +# define costab23 MAD_F(0x36ba2014)
  269. +# define costab24 MAD_F(0x30fbc54d)
  270. +# define costab25 MAD_F(0x2b1f34eb)
  271. +# define costab26 MAD_F(0x25280c5e)
  272. +# define costab27 MAD_F(0x1f19f97b)
  273. +# define costab28 MAD_F(0x18f8b83c)
  274. +# define costab29 MAD_F(0x12c8106f)
  275. +# define costab30 MAD_F(0x0c8bd35e)
  276. +# define costab31 MAD_F(0x0647d97c)
  277. +
  278. +
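The table above is the usual dct32 cosine set, costabK = cos(K*pi/64), written as Q31 words; this file's MAD_F rounds each one down to a Q15 halfword so it fits the 16-bit operand of mulsatrndwh.w. One worked entry:

    costab1 = MAD_F(0x7fd8878e)
            = (0x7fd8878e + 0x8000) >> 16
            = 0x7fd9078e >> 16
            = 0x7fd9          /* 32729/32768 ~ 0.9988 ~ cos(pi/64) */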
  279. + .macro butterfly2_in out1, out2, out3, out4, in, idx_in1, idx_in2, idx_in3, idx_in4, coeff1, coeff2, tmplo, tmphi
  280. + mov \tmplo, \coeff1
  281. + ld.w \out1, \in[\idx_in1 * 4]
  282. + ld.w \out2, \in[\idx_in2 * 4]
  283. + ld.w \out3, \in[\idx_in3 * 4]
  284. + ld.w \out4, \in[\idx_in4 * 4]
  285. + sub \tmphi, \out1, \out2
  286. + add \out1, \out2
  287. + mulsatrndwh.w \out2, \tmphi, \tmplo:b
  288. +
  289. + sub \tmphi, \out3, \out4
  290. + mov \tmplo, \coeff2
  291. + add \out3, \out4
  292. + mulsatrndwh.w \out4, \tmphi, \tmplo:b
  293. + .endm
  294. +
  295. + .macro butterfly2 in1, in2, in3, in4, coeff1, tmplo, tmphi, tmp
  296. + mov \tmp, \coeff1
  297. + sub \tmphi, \in1, \in2
  298. + add \in1, \in2
  299. + mulsatrndwh.w \in2, \tmphi, \tmp:b
  300. +
  301. + sub \tmphi, \in3, \in4
  302. + add \in3, \in4
  303. + mulsatrndwh.w \in4, \tmphi, \tmp:b
  304. + .endm
  305. +
  306. + .macro butterfly4 in1, in2, in3, in4, in5, in6, in7, in8, coeff1, tmplo, tmphi, tmp
  307. + mov \tmp, \coeff1
  308. + sub \tmphi, \in1, \in2
  309. + add \in1, \in2
  310. + mulsatrndwh.w \in2, \tmphi, \tmp:b
  311. +
  312. + sub \tmphi, \in3, \in4
  313. + add \in3, \in4
  314. + mulsatrndwh.w \in4, \tmphi, \tmp:b
  315. +
  316. + sub \tmphi, \in5, \in6
  317. + add \in5, \in6
  318. + mulsatrndwh.w \in6, \tmphi, \tmp:b
  319. +
  320. + sub \tmphi, \in7, \in8
  321. + add \in7, \in8
  322. + mulsatrndwh.w \in8, \tmphi, \tmp:b
  323. + .endm
  324. +
  325. + .macro scale reg
  326. + .endm
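Each butterfly macro is the same primitive repeated: keep the sum, and replace the second operand with a rounded, saturated product of the difference and a Q15 coefficient. A plain-C model of one butterfly2 pair, assuming the usual reading of mulsatrndwh.w (multiply word by halfword, add 2^14, arithmetic shift right 15, saturate to 32 bits):

    /* model of mulsatrndwh.w Rd, Rx, Ry:b -- an assumption, not Atmel's code */
    static long mulsatrndwh_w(long x, short coeff)
    {
      long long p = (long long)x * coeff;        /* 32 x 16 multiply */
      p = (p + (1 << 14)) >> 15;                 /* round, drop Q15 scaling */
      if (p >  0x7fffffffLL) p =  0x7fffffffLL;  /* saturate to 32 bits */
      if (p < -0x80000000LL) p = -0x80000000LL;
      return (long)p;
    }

    /* what one `butterfly2 in1, in2, ...` pair computes */
    static void butterfly2_model(long *in1, long *in2, short coeff)
    {
      long diff = *in1 - *in2;
      *in1 += *in2;                        /* in1 = in1 + in2          */
      *in2  = mulsatrndwh_w(diff, coeff);  /* in2 = MUL(in1 - in2, c)  */
    }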
  327. +
  328. +/*void dct32( mad_fixed_t const in[32], unsigned int slot,
  329. + mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]) */
  330. +
  331. + .global dct32_avr32
  332. +dct32_avr32:
  333. + stm --sp, r0-r7, r9-r11, lr
  334. +
  335. + sub sp, 32*4
  336. +
  337. +/* t0 = in[0] + in[31]; t16 = MUL(in[0] - in[31], costab1);
  338. + t1 = in[15] + in[16]; t17 = MUL(in[15] - in[16], costab31); */
  339. + butterfly2_in r4/*t0*/, r5/*t16*/, r6/*t1*/, r7/*t17*/, r12, 0, 31, 15, 16, costab1, costab31, r10, r11
  340. +
  341. +/* t41 = t16 + t17;
  342. + t59 = MUL(t16 - t17, costab2);
  343. + t33 = t0 + t1;
  344. + t50 = MUL(t0 - t1, costab2);*/
  345. + butterfly2 r5/*t41*/, r7/*t59*/, r4/*t33*/, r6/*t50*/, costab2, r10, r11, lr
  346. +
  347. +/* t2 = in[7] + in[24]; t18 = MUL(in[7] - in[24], costab15);
  348. + t3 = in[8] + in[23]; t19 = MUL(in[8] - in[23], costab17); */
  349. + butterfly2_in r0/*t2*/, r1/*t18*/, r2/*t3*/, r3/*t19*/, r12, 7, 24, 8, 23, costab15, costab17, r10, r11
  350. +
  351. +/* t42 = t18 + t19;
  352. + t60 = MUL(t18 - t19, costab30);
  353. + t34 = t2 + t3;
  354. + t51 = MUL(t2 - t3, costab30); */
  355. + butterfly2 r1/*t42*/, r3/*t60*/, r0/*t34*/, r2/*t51*/, costab30, r10, r11, lr
  356. +
  357. +/* t73 = t41 + t42; t94 = MUL(t41 - t42, costab4);
  358. + t83 = t59 + t60; t106 = MUL(t59 - t60, costab4); */
  359. +
  360. +
  361. +/* t69 = t33 + t34; t89 = MUL(t33 - t34, costab4);
  362. + t78 = t50 + t51; t100 = MUL(t50 - t51, costab4); */
  363. + butterfly4 r5/*t73*/, r1/*t94*/, r7/*t83*/, r3/*t106*/,r4/*t69*/, r0/*t89*/, r6/*t78*/, r2/*t100*/, costab4, r10, r11, lr
  364. +
  365. +/* Store away the computed butterflies:
  366. + sp[0-7] = t83, t78, t73, t69, t106, t100, t94, t89 */
  367. + stm sp, r0-r7
  368. +
  369. +
  370. +/* t4 = in[3] + in[28]; t20 = MUL(in[3] - in[28], costab7);
  371. + t5 = in[12] + in[19]; t21 = MUL(in[12] - in[19], costab25); */
  372. + butterfly2_in r4/*t4*/, r5/*t20*/, r6/*t5*/, r7/*t21*/, r12, 3, 28, 12, 19, costab7, costab25, r10, r11
  373. +
  374. +/* t43 = t20 + t21;
  375. + t61 = MUL(t20 - t21, costab14);
  376. + t35 = t4 + t5;
  377. + t52 = MUL(t4 - t5, costab14); */
  378. + butterfly2 r5/*t43*/, r7/*t61*/, r4/*t35*/, r6/*t52*/, costab14, r10, r11, lr
  379. +
  380. +/* t6 = in[4] + in[27]; t22 = MUL(in[4] - in[27], costab9);
  381. + t7 = in[11] + in[20]; t23 = MUL(in[11] - in[20], costab23); */
  382. + butterfly2_in r0/*t6*/, r1/*t22*/, r2/*t7*/, r3/*t23*/, r12, 4, 27, 11, 20, costab9, costab23, r10, r11
  383. +
  384. +/* t44 = t22 + t23;
  385. + t62 = MUL(t22 - t23, costab18);
  386. + t36 = t6 + t7;
  387. + t53 = MUL(t6 - t7, costab18); */
  388. + butterfly2 r1/*t44*/, r3/*t62*/, r0/*t36*/, r2/*t53*/, costab18, r10, r11, lr
  389. +
  390. +/* t74 = t43 + t44; t95 = MUL(t43 - t44, costab28);
  391. + t84 = t61 + t62; t107 = MUL(t61 - t62, costab28); */
  392. +
  393. +/* t70 = t35 + t36; t90 = MUL(t35 - t36, costab28);
  394. + t79 = t52 + t53; t101 = MUL(t52 - t53, costab28); */
  395. + butterfly4 r5/*t74*/, r1/*t95*/, r7/*t84*/, r3/*t107*/, r4/*t70*/, r0/*t90*/, r6/*t79*/, r2/*t101*/, costab28, r10, r11, lr
  396. +
  397. +/* Store away the computed butterflies:
  398. + sp[8-15] = t84, t79, t74, t70, t107, t101, t95, t90 */
  399. + sub r10, sp, -8*4
  400. + stm r10, r0-r7
  401. +
  402. +
  403. +/* t8 = in[1] + in[30]; t24 = MUL(in[1] - in[30], costab3);
  404. + t9 = in[14] + in[17]; t25 = MUL(in[14] - in[17], costab29); */
  405. + butterfly2_in r4/*t8*/, r5/*t24*/, r6/*t9*/, r7/*t25*/, r12, 1, 30, 14, 17, costab3, costab29, r10, r11
  406. +
  407. +
  408. +/* t45 = t24 + t25;
  409. + t63 = MUL(t24 - t25, costab6);
  410. + t37 = t8 + t9;
  411. + t54 = MUL(t8 - t9, costab6); */
  412. + butterfly2 r5/*t45*/, r7/*t63*/, r4/*t37*/, r6/*t54*/, costab6, r10, r11, lr
  413. +
  414. +/* t10 = in[6] + in[25]; t26 = MUL(in[6] - in[25], costab13);
  415. + t11 = in[9] + in[22]; t27 = MUL(in[9] - in[22], costab19); */
  416. + butterfly2_in r0/*t10*/, r1/*t26*/, r2/*t11*/, r3/*t27*/, r12, 6, 25, 9, 22, costab13, costab19, r10, r11
  417. +
  418. +/* t46 = t26 + t27;
  419. + t64 = MUL(t26 - t27, costab26);
  420. + t38 = t10 + t11;
  421. + t55 = MUL(t10 - t11, costab26); */
  422. + butterfly2 r1/*t46*/, r3/*t64*/, r0/*t38*/, r2/*t55*/, costab26, r10, r11, lr
  423. +
  424. +/* t75 = t45 + t46; t96 = MUL(t45 - t46, costab12);
  425. + t85 = t63 + t64; t108 = MUL(t63 - t64, costab12); */
  426. +
  427. +/* t71 = t37 + t38; t91 = MUL(t37 - t38, costab12);
  428. + t80 = t54 + t55; t102 = MUL(t54 - t55, costab12); */
  429. + butterfly4 r5/*t75*/, r1/*t96*/, r7/*t85*/, r3/*t108*/, r4/*t71*/, r0/*t91*/, r6/*t80*/, r2/*t102*/, costab12, r10, r11, lr
  430. +
  431. +/* Store away the computed butterflies:
  432. + sp[16-23] = t85, t80, t75, t71, t108, t102, t96, t91 */
  433. + sub r10, sp, -16*4
  434. + stm r10, r0-r7
  435. +
  436. +/* t12 = in[2] + in[29]; t28 = MUL(in[2] - in[29], costab5);
  437. + t13 = in[13] + in[18]; t29 = MUL(in[13] - in[18], costab27); */
  438. + butterfly2_in r4/*t12*/, r5/*t28*/, r6/*t13*/, r7/*t29*/, r12, 2, 29, 13, 18, costab5, costab27, r10, r11
  439. +
  440. +/* t47 = t28 + t29;
  441. + t65 = MUL(t28 - t29, costab10);
  442. + t39 = t12 + t13;
  443. + t56 = MUL(t12 - t13, costab10); */
  444. + butterfly2 r5/*t47*/, r7/*t65*/, r4/*t39*/, r6/*t56*/, costab10, r10, r11, lr
  445. +
  446. +/* t14 = in[5] + in[26]; t30 = MUL(in[5] - in[26], costab11);
  447. + t15 = in[10] + in[21]; t31 = MUL(in[10] - in[21], costab21);*/
  448. + butterfly2_in r0/*t14*/, r1/*t30*/, r2/*t15*/, r3/*t31*/, r12, 5, 26, 10, 21, costab11, costab21, r10, r11
  449. +
  450. +/* t48 = t30 + t31;
  451. + t66 = MUL(t30 - t31, costab22);
  452. + t40 = t14 + t15;
  453. + t57 = MUL(t14 - t15, costab22);*/
  454. + butterfly2 r1/*t48*/, r3/*t66*/, r0/*t40*/, r2/*t57*/, costab22, r10, r11, lr
  455. +
  456. +/* t76 = t47 + t48; t97 = MUL(t47 - t48, costab20);
  457. + t86 = t65 + t66; t109 = MUL(t65 - t66, costab20);*/
  458. +
  459. +/* t72 = t39 + t40; t92 = MUL(t39 - t40, costab20);
  460. + t81 = t56 + t57; t103 = MUL(t56 - t57, costab20);*/
  461. + butterfly4 r5/*t76*/, r1/*t97*/, r7/*t86*/, r3/*t109*/,r4/*t72*/, r0/*t92*/, r6/*t81*/, r2/*t103*/, costab20, r10, r11, lr
  462. +
  463. +/* Store away the computed butterflies:
  464. + sp[24-31] = t86, t81, t76, t72, t109, t103, t97, t92 */
  465. + sub r10, sp, -24*4
  466. + stm r10, r0-r7
  467. +
  468. +/* We now have the following on the stack:
  469. +
  470. + sp[0-7] = t83, t78, t73, t69, t106, t100, t94, t89
  471. + sp[8-15] = t84, t79, t74, t70, t107, t101, t95, t90
  472. + sp[16-23] = t85, t80, t75, t71, t108, t102, t96, t91
  473. + sp[24-31] = t86, t81, t76, t72, t109, t103, t97, t92 */
  474. +
  475. +/* Load {r0...r7} = { t72, t76, t71, t75, t70, t74, t69, t73 } */
  476. + ld.d r6, sp[2*4]
  477. + ld.d r4, sp[10*4]
  478. + ld.d r2, sp[18*4]
  479. + ld.d r0, sp[26*4]
  480. +
  481. +
  482. +/* t113 = t69 + t70;
  483. + t141 = MUL(t69 - t70, costab8);
  484. +
  485. + t115 = t73 + t74;
  486. + t144 = MUL(t73 - t74, costab8); */
  487. + butterfly2 r6/*t113*/, r4/*t141*/, r7/*t115*/, r5/*t144*/, costab8, r10, r11, lr
  488. +
  489. +/* t114 = t71 + t72;
  490. + t142 = MUL(t71 - t72, costab24);
  491. +
  492. + t116 = t75 + t76;
  493. + t145 = MUL(t75 - t76, costab24); */
  494. + butterfly2 r2/*t114*/, r0/*t142*/, r3/*t116*/, r1/*t145*/, costab24, r10, r11, lr
  495. +
  496. +
  497. +/*
  498. + t191 = t113 + t114;
   499. + t192 = MUL(t113 - t114, costab16);
  500. +
  501. + t32 = t115 + t116;
  502. + t177 = MUL(t115 - t116, costab16) ;
  503. +
  504. + t143 = t141 + t142;
  505. + t190 = MUL(t141 - t142, costab16) ;
  506. +
  507. + t146 = t144 + t145;
  508. + t184 = MUL(t144 - t145, costab16) ; */
   509. + butterfly4 r6/*t191*/, r2/*t192*/, r7/*t32*/, r3/*t177*/, r4/*t143*/, r0/*t190*/, r5/*t146*/, r1/*t184*/, costab16, r10, r11, lr
  510. +
  511. +/* Store away the computed butterflies:
  512. + sp[2-3] = t32, t191
  513. + sp[10-11] = t146, t143
  514. + sp[18-19] = t177, t192
  515. + sp[26-27] = t184, t190 */
  516. + st.d sp[2*4] , r6
  517. + st.d sp[10*4], r4
  518. + st.d sp[18*4], r2
  519. + st.d sp[26*4], r0
  520. +
  521. +/* Load {r0...r7} = { t81, t86, t80, t85, t79, t84, t78, t83 } */
  522. + ld.d r6, sp[0*4]
  523. + ld.d r4, sp[8*4]
  524. + ld.d r2, sp[16*4]
  525. + ld.d r0, sp[24*4]
  526. +
  527. +
  528. +/* t118 = t78 + t79;
  529. + t148 = MUL(t78 - t79, costab8);
  530. +
  531. + t121 = t83 + t84;
  532. + t152 = MUL(t83 - t84, costab8); */
  533. + butterfly2 r6/*t118*/, r4/*t148*/, r7/*t121*/, r5/*t152*/, costab8, r10, r11, lr
  534. +
  535. +/* t119 = t80 + t81;
  536. + t149 = MUL(t80 - t81, costab24);
  537. +
  538. + t122 = t85 + t86;
  539. + t153 = MUL(t85 - t86, costab24); */
  540. + butterfly2 r2/*t119*/, r0/*t149*/, r3/*t122*/, r1/*t153*/, costab24, r10, r11, lr
  541. +
  542. +
  543. +
  544. +/* t58 = t118 + t119;
  545. + t178 = MUL(t118 - t119, costab16) ;
  546. +
  547. + t67 = t121 + t122;
  548. + t179 = MUL(t121 - t122, costab16) ;
  549. +
  550. + t150 = t148 + t149;
  551. + t185 = MUL(t148 - t149, costab16) ;
  552. +
  553. + t154 = t152 + t153;
  554. + t186 = MUL(t152 - t153, costab16) ; */
   555. + butterfly4 r6/*t58*/, r2/*t178*/, r7/*t67*/, r3/*t179*/, r4/*t150*/, r0/*t185*/, r5/*t154*/, r1/*t186*/, costab16, r10, r11, lr
  556. +
  557. +/* Store away the computed butterflies:
  558. + sp[0-1] = t67, t58
  559. + sp[8-9] = t154, t150
  560. + sp[16-17] = t179, t178
  561. + sp[24-25] = t186, t185 */
  562. + st.d sp[0*4] , r6
  563. + st.d sp[8*4], r4
  564. + st.d sp[16*4], r2
  565. + st.d sp[24*4], r0
  566. +
  567. +/* Load {r0...r7} = { t92, t97, t91, t96, t90, t95, t89, t94 } */
  568. + ld.d r6, sp[6*4]
  569. + ld.d r4, sp[14*4]
  570. + ld.d r2, sp[22*4]
  571. + ld.d r0, sp[30*4]
  572. +
  573. +
  574. +/* t125 = t89 + t90;
  575. + t157 = MUL(t89 - t90, costab8);
  576. +
  577. + t128 = t94 + t95;
  578. + t161 = MUL(t94 - t95, costab8); */
  579. + butterfly2 r6/*t125*/, r4/*t157*/, r7/*t128*/, r5/*t161*/, costab8, r10, r11, lr
  580. +
  581. +/* t126 = t91 + t92;
  582. + t158 = MUL(t91 - t92, costab24);
  583. +
  584. + t129 = t96 + t97;
  585. + t162 = MUL(t96 - t97, costab24); */
  586. + butterfly2 r2/*t126*/, r0/*t158*/, r3/*t129*/, r1/*t162*/, costab24, r10, r11, lr
  587. +
  588. +
  589. +/*
  590. + t93 = t125 + t126;
  591. + t180 = MUL(t125 - t126, costab16) ;
  592. +
  593. + t98 = t128 + t129;
  594. + t181 = MUL(t128 - t129, costab16) ;
  595. +
  596. + t159 = t157 + t158;
  597. + t187 = MUL(t157 - t158, costab16) ;
  598. +
  599. + t163 = t161 + t162;
  600. + t188 = MUL(t161 - t162, costab16) ; */
   601. + butterfly4 r6/*t93*/, r2/*t180*/, r7/*t98*/, r3/*t181*/, r4/*t159*/, r0/*t187*/, r5/*t163*/, r1/*t188*/, costab16, r10, r11, lr
  602. +
  603. +
  604. +/* Store away the computed butterflies:
  605. + sp[6-7] = t98, t93
  606. + sp[14-15] = t163, t159
  607. + sp[22-23] = t181, t180
  608. + sp[30-31] = t188, t187 */
  609. + st.d sp[6*4] , r6
  610. + st.d sp[14*4], r4
  611. + st.d sp[22*4], r2
  612. + st.d sp[30*4], r0
  613. +
  614. +/* Load {r0...r7} = { t103, t109, t102, t108, t101, t107, t100, t106 } */
  615. + ld.d r6, sp[4*4]
  616. + ld.d r4, sp[12*4]
  617. + ld.d r2, sp[20*4]
  618. + ld.d r0, sp[28*4]
  619. +
  620. +
  621. +
  622. +/* t132 = t100 + t101;
  623. + t166 = MUL(t100 - t101, costab8);
  624. +
  625. + t136 = t106 + t107;
  626. + t171 = MUL(t106 - t107, costab8); */
  627. + butterfly2 r6/*t132*/, r4/*t166*/, r7/*t136*/, r5/*t171*/, costab8, r10, r11, lr
  628. +
  629. +/* t133 = t102 + t103;
  630. + t167 = MUL(t102 - t103, costab24);
  631. +
  632. + t137 = t108 + t109;
  633. + t172 = MUL(t108 - t109, costab24);*/
  634. + butterfly2 r2/*t133*/, r0/*t167*/, r3/*t137*/, r1/*t172*/, costab24, r10, r11, lr
  635. +
  636. +
  637. +/* t104 = t132 + t133;
  638. + t182 = MUL(t132 - t133, costab16) ;
  639. +
  640. + t110 = t136 + t137;
  641. + t183 = MUL(t136 - t137, costab16) ;
  642. +
  643. + t168 = t166 + t167;
  644. + t189 = MUL(t166 - t167, costab16) ;
  645. +
  646. + t173 = t171 + t172;
  647. + t208 = MUL(t171 - t172, costab16) ; */
   648. + butterfly4 r6/*t104*/, r2/*t182*/, r7/*t110*/, r3/*t183*/, r4/*t168*/, r0/*t189*/, r5/*t173*/, r1/*t208*/, costab16, r10, r11, lr
  649. +
  650. +/* Store away the computed butterflies:
  651. + sp[4-5] = t110, t104
  652. + sp[12-13] = t173, t168
  653. + sp[20-21] = t183, t182
  654. + sp[28-29] = t208, t189 */
  655. + st.d sp[4*4] , r6
  656. + st.d sp[12*4], r4
  657. + st.d sp[20*4], r2
  658. + st.d sp[28*4], r0
  659. +
  660. +/* Now we have the following stack
  661. +
  662. + sp[0-7] = t67, t58 , t32, t191, t110, t104, t98, t93
  663. + sp[8-15] = t154, t150, t146, t143, t173, t168, t163, t159
  664. + sp[16-23] = t179, t178, t177, t192, t183, t182, t181, t180
  665. + sp[24-31] = t186, t185, t184, t190, t208, t189, t188, t187
  666. +*/
  667. +
  668. + /* Get slot, lo and hi from stack */
  669. + lddsp lr, sp[32*4 + 4] /*slot*/
  670. + lddsp r12, sp[32*4 + 8] /*lo*/
  671. + lddsp r11, sp[32*4 + 12] /*hi*/
  672. +
  673. + add r12, r12, lr << 2
  674. + add r11, r11, lr << 2
  675. +
  676. +
  677. +/* t49 = -(t67 * 2) + t32;
  678. + hi[14][slot] = SHIFT(t32);
  679. + t87 = -(t110 * 2) + t67;
  680. + t138 = -(t173 * 2) + t110;
  681. + t203 = -(t208 * 2) + t173; */
  682. +
  683. + lddsp r0/*t67*/, sp[0]
  684. + lddsp r1/*t32*/, sp[2*4]
  685. + lddsp r2/*t110*/, sp[4*4]
  686. + lddsp r3/*t173*/, sp[12*4]
  687. + lddsp r5/*t208*/, sp[28*4]
  688. +
  689. + sub r4/*t49*/, r1, r0 << 1
  690. + scale r1
  691. + sub r0/*t87*/, r0, r2 << 1
  692. + st.w r11[14*SLOTS*4], r1
  693. + sub r2/*t138*/, r2, r3 << 1
  694. + sub r1/*t203*/, r3, r5 << 1
  695. +
  696. +/* Live: r0 = t87, r1= t203, r2= t138, r4 = t49
  697. + Free: r3, r5, r6, r7, r8, r9, r10, lr */
  698. +
  699. +/* t68 = (t98 * 2) + t49;
  700. + hi[12][slot] = SHIFT(-t49);
  701. + t130 = -(t163 * 2) + t98;
  702. + t201 = -(t188 * 2) + t163;
  703. + t200 = -(t186 * 2) + t154;
  704. + t111 = (t154 * 2) + t87;
  705. + t77 = -(-(t87 * 2) - t68);
  706. + t88 = (t146 * 2) + t77;
  707. + t199 = -(t184 * 2) + t146;
  708. + hi[ 8][slot] = SHIFT(-t77);
  709. + hi[10][slot] = SHIFT(t68);*/
  710. + lddsp r3/*t98*/, sp[6*4]
  711. + lddsp r5/*t163*/, sp[14*4]
  712. + lddsp r6/*t188*/, sp[30*4]
  713. + lddsp r10/*t186*/, sp[24*4]
  714. +
  715. + add r7/*t68*/, r4, r3 << 1
  716. + neg r4
  717. + scale r4
  718. + lddsp r9/*t154*/, sp[8*4]
  719. + sub r3/*t130*/, r3, r5 << 1
  720. + st.w r11[12*SLOTS*4], r4
  721. + sub r8/*t201*/, r5, r6 << 1
  722. + sub r4/*t200*/, r9, r10 << 1
  723. + lddsp lr/*t146*/, sp[10*4]
  724. + lddsp r6/*t184*/, sp[26*4]
  725. + add r10/*t111*/, r0, r9 << 1
  726. + add r5/*t77*/,r7, r0 << 1
  727. + add r0/*t88*/, r5, lr << 1
  728. + sub r6/*t199*/, lr, r6 << 1
  729. + neg r5
  730. + scale r5
  731. + scale r7
  732. + st.w r11[8*SLOTS*4], r5
  733. + st.w r11[10*SLOTS*4], r7
  734. +
  735. +/* Live: r0 = t88, r1= t203, r2= t138, r3 = t130, r4 = t200,
  736. + r6 = 199, r8 = t201, r10 = t111
  737. + Free: r5, r7, r9, lr */
  738. +
  739. +
  740. +/*
  741. + t123 = -(-(t138 * 2) - t111);
  742. + t174 = (t183 * 2) + t138;
  743. + t99 = -(t111 * 2) + t88;
  744. + hi[ 6][slot] = SHIFT(t88); */
  745. + lddsp r5/*t183*/, sp[20*4]
  746. +
  747. + add r7/*t123*/, r10, r2 << 1
  748. + sub r10/*t99*/, r0, r10 << 1
  749. + scale r0
  750. + add r2/*t174*/, r2, r5 << 1
  751. + st.w r11[6*SLOTS*4], r0
  752. +
  753. +/* Live: r1 = t203, r2 = t174, r3 = t130, r4 = t200,
  754. + r6 = t199, r7 = t123, r8 = t201, r10 = t99
  755. + Free: r0, r5, r9, lr */
  756. +
  757. +/* t112 = -(t130 * 2) + t99;
  758. + t164 = (t181 * 2) + t130;
  759. + hi[ 4][slot] = SHIFT(-t99); */
  760. + lddsp r0/*t181*/, sp[22*4]
  761. +
  762. + sub r5/*t112*/, r10, r3 << 1
  763. + neg r10
  764. + scale r10
   765. + add r3/*t164*/, r3, r0 << 1
  766. + st.w r11[4*SLOTS*4], r10
  767. +
  768. +/* Live: r1 = t203, r2 = t174, r3 = t164, r4 = t200,
  769. + r5 = t112, r6 = t199, r7 = t123, r8 = t201
  770. + Free: r0, r9, r10, lr */
  771. +
  772. +
  773. +/* t117 = -(-(t123 * 2) - t112);
  774. + t139 = (t179 * 2) + t123;
  775. + hi[ 2][slot] = SHIFT(t112); */
  776. + lddsp r0/*t179*/, sp[16*4]
  777. +
  778. + add r9/*t117*/, r5, r7 << 1
  779. + scale r5
  780. + add r7/*t139*/, r7, r0 << 1
  781. + st.w r11[2*SLOTS*4], r5
  782. +
  783. +/* Live: r1 = t203, r2 = t174, r3 = t164, r4 = t200,
  784. + r6 = t199, r7 = t139, r8 = t201, r9 = t117
  785. + Free: r0, r5, r10, lr */
  786. +
  787. +/* t155 = -(t174 * 2) + t139;
  788. + t204 = -(-(t203 * 2) - t174);
  789. + t124 = (t177 * 2) + t117;
  790. + hi[ 0][slot] = SHIFT(-t117);
  791. + t131 = -(t139 * 2) + t124;
  792. + lo[ 1][slot] = SHIFT(t124);*/
  793. + lddsp r0/*t177*/, sp[18*4]
  794. +
  795. + sub r5/*t155*/, r7, r2 << 1
  796. + add r2/*t204*/, r2, r1 << 1
  797. + add r0/*t124*/, r9, r0 << 1
  798. + neg r9
  799. + scale r9
  800. + sub r7/*t131*/, r0, r7 << 1
  801. + scale r0
  802. + st.w r11[0*SLOTS*4], r9
  803. + st.w r12[1*SLOTS*4], r0
  804. +
  805. +/* Live: r2 = t204, r3 = t164, r4 = t200,
  806. + r5 = t155, r6 = t199, r7 = t131, r8 = t201
  807. + Free: r0, r1, r9, r10, lr */
  808. +
  809. +/* t140 = (t164 * 2) + t131;
  810. + lo[ 3][slot] = SHIFT(-t131);
  811. + t202 = -(-(t201 * 2) - t164); */
  812. + add r0/*t140*/, r7, r3 << 1
  813. + neg r7
  814. + scale r7
  815. + add r3/*t202*/, r3, r8 << 1
  816. + st.w r12[3*SLOTS*4], r7
  817. +
  818. +/* Live: r0 = t140, r2 = t204, r3 = t202, r4 = t200,
  819. + r5 = t155, r6 = t199
  820. + Free: r1, r7, r8, r9, r10, lr */
  821. +
  822. +
  823. +/* t147 = -(-(t155 * 2) - t140);
  824. + lo[ 5][slot] = SHIFT(t140);
  825. + t175 = -(t200 * 2) + t155;
  826. + t156 = -(t199 * 2) + t147;
  827. + lo[ 7][slot] = SHIFT(-t147); */
  828. + add r1/*t147*/, r0, r5 << 1
  829. + scale r0
  830. + sub r5/*t175*/, r5, r4 << 1
   831. + sub r4/*t156*/, r1, r6 << 1
  832. + neg r1
  833. + scale r1
  834. + st.w r12[5*SLOTS*4], r0
  835. + st.w r12[7*SLOTS*4], r1
  836. +
  837. +/* Live: r2 = t204, r3 = t202,
  838. + r4 = t156, r5 = t175
  839. + Free: r0, r1, r6, r7, r8, r9, r10, lr */
  840. +
  841. +
  842. +/* t205 = -(-(t204 * 2) - t175);
  843. + t165 = -(t175 * 2) + t156;
  844. + lo[ 9][slot] = SHIFT(t156);
  845. + t176 = -(t202 * 2) + t165;
  846. + lo[11][slot] = SHIFT(-t165);
  847. + t206 = -(-(t205 * 2) - t176);
   848. + lo[15][slot] = SHIFT(-t206);
   849. + lo[13][slot] = SHIFT(t176); */
  850. + add r0/*t205*/, r5, r2 << 1
  851. + sub r1/*t165*/, r4, r5 << 1
  852. + scale r4
  853. + sub r3/*t176*/, r1, r3 << 1
  854. + st.w r12[9*SLOTS*4], r4
  855. + neg r1
  856. + scale r1
  857. + add r6/*t206*/, r3, r0 << 1
  858. + neg r6
  859. + scale r6
  860. + scale r3
  861. + st.w r12[11*SLOTS*4], r1
  862. + st.w r12[15*SLOTS*4], r6
  863. + st.w r12[13*SLOTS*4], r3
  864. +
   865. +/* t193 = -((t190 * 2) - t143);
  866. + hi[ 7][slot] = SHIFT(t143);
  867. + lo[ 8][slot] = SHIFT(-t193);
  868. + t82 = -(t104 * 2) + t58;
  869. + hi[13][slot] = SHIFT(t58);
  870. + t134 = -(t168 * 2) + t104;
  871. + t196 = -(t189 * 2) + t168; */
  872. +
  873. + lddsp r0/*t190*/, sp[27*4]
  874. + lddsp r1/*t143*/, sp[11*4]
  875. + lddsp r2/*t104*/, sp[5*4]
  876. + lddsp r3/*t58*/, sp[1*4]
  877. + lddsp r4/*t168*/, sp[13*4]
  878. + lddsp r5/*t189*/, sp[29*4]
  879. + sub r0/*t193*/, r1, r0 << 1
  880. + neg r0
  881. + scale r1
  882. + scale r0
  883. + st.w r11[7*SLOTS*4], r1
  884. + st.w r12[8*SLOTS*4], r0
  885. + sub r0/*t82*/, r3, r2 << 1
  886. + scale r3
  887. + sub r2/*t134*/, r2, r4 << 1
  888. + sub r4/*t196*/, r4, r5 << 1
  889. + st.w r11[13*SLOTS*4], r3
  890. +
  891. +/* Live: r0 = t82, r2 = t134,
  892. + r4 = t196
  893. + Free: r1, r3, r5, r6, r7, r8, r9, r10, lr */
  894. +
  895. +
  896. +
  897. +/*
  898. +
  899. + t207 = -(t185 * 2) + t150;
  900. + t105 = (t150 * 2) + t82;
  901. + hi[ 9][slot] = SHIFT(-t82);
  902. + t120 = -(-(t134 * 2) - t105);
  903. + hi[ 5][slot] = SHIFT(t105);
  904. + t169 = (t182 * 2) + t134;
  905. +
  906. + t135 = (t178 * 2) + t120;
  907. + hi[ 1][slot] = SHIFT(-t120);
  908. + t197 = -(-(t196 * 2) - t169);
  909. + t151 = -(t169 * 2) + t135;
  910. + lo[ 2][slot] = SHIFT(t135); */
  911. + lddsp r1/*t185*/, sp[25*4]
  912. + lddsp r3/*t150*/, sp[9*4]
  913. + lddsp r5/*t182*/, sp[21*4]
  914. + lddsp r8/*t178*/, sp[17*4]
  915. +
  916. + sub r6/*t207*/, r3, r1 << 1
  917. + add r3/*t105*/, r0, r3 << 1
  918. + neg r0
  919. + scale r0
  920. + add r7/*t120*/, r3, r2 << 1
  921. + scale r3
  922. + st.w r11[9*SLOTS*4], r0
  923. + st.w r11[5*SLOTS*4], r3
  924. + add r2/*t169*/, r2, r5 << 1
  925. + add r8/*t135*/, r7, r8 << 1
  926. + neg r7
  927. + scale r7
  928. + add r4/*t197*/, r2, r4 << 1
  929. + sub r2/*t151*/, r8, r2 << 1
  930. + scale r8
  931. + st.w r11[1*SLOTS*4], r7
  932. + st.w r12[2*SLOTS*4], r8
  933. +
  934. +/* Live: r2 = t151, r4 = t197, r6 = t207
  935. +
  936. + Free: r0, r1, r3, r5, r7, r8, r9, r10, lr */
  937. +
  938. +
  939. +
  940. +/* t170 = -(t207 * 2) + t151;
  941. + lo[ 6][slot] = SHIFT(-t151);
  942. +
  943. + t198 = -(-(t197 * 2) - t170);
  944. + lo[10][slot] = SHIFT(t170);
  945. + lo[14][slot] = SHIFT(-t198);
  946. +
  947. + t127 = -(t159 * 2) + t93;
  948. + hi[11][slot] = SHIFT(t93);
  949. + t194 = -(t187 * 2) + t159; */
  950. + lddsp r0/*t159*/, sp[15*4]
  951. + lddsp r1/*t93*/, sp[7*4]
  952. + lddsp r3/*t187*/, sp[31*4]
  953. + sub r5/*t170*/, r2, r6 << 1
  954. + neg r2
  955. + scale r2
  956. + add r4/*t198*/,r5, r4 << 1
  957. + neg r4
  958. + scale r5
  959. + scale r4
  960. + st.w r12[6*SLOTS*4], r2
  961. + st.w r12[10*SLOTS*4], r5
  962. + st.w r12[14*SLOTS*4], r4
  963. + sub r7/*t127*/, r1, r0 << 1
  964. + scale r1
  965. + sub r0/*t194*/, r0, r3 << 1
  966. + st.w r11[11*SLOTS*4], r1
  967. +
  968. +
  969. +/* Live: r0 = t194, r7 = t127
  970. + Free: r1, r2, r3, r4, r6, r5, r8, r9, r10, lr */
  971. +
  972. +/* t160 = (t180 * 2) + t127;
  973. + hi[ 3][slot] = SHIFT(-t127);
  974. + t195 = -(-(t194 * 2) - t160);
  975. + lo[ 4][slot] = SHIFT(t160);
  976. + lo[12][slot] = SHIFT(-t195);
  977. +
  978. + hi[15][slot] = SHIFT(t191);
  979. + lo[ 0][slot] = SHIFT(t192); */
  980. + lddsp r1/*t180*/, sp[23*4]
  981. + lddsp r2/*t191*/, sp[3*4]
  982. + lddsp r3/*t192*/, sp[19*4]
  983. + add r4/*t160*/, r7, r1 << 1
  984. + neg r7
  985. + scale r7
  986. + add r6/*t195*/, r4, r0 << 1
  987. + scale r4
  988. + neg r6
  989. + scale r6
  990. + st.w r11[3*SLOTS*4], r7
  991. + st.w r12[4*SLOTS*4], r4
  992. + st.w r12[12*SLOTS*4], r6
  993. + scale r2
  994. + scale r3
  995. + st.w r11[15*SLOTS*4], r2
  996. + st.w r12[0*SLOTS*4], r3
  997. +
  998. + sub sp, -32*4
  999. + ldm sp++,r0-r7, r9-r11, pc
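The routine is a drop-in replacement for the dct32() step of libmad's subband synthesis. A hedged call sketch, with the signature taken from the prototype comment at the top of the file (SLOTS is 8, matching the define):

    extern void dct32_avr32(mad_fixed_t const in[32], unsigned int slot,
                            mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]);

    void synth_half(mad_fixed_t const in[32], unsigned int slot,
                    mad_fixed_t lo[16][8], mad_fixed_t hi[16][8])
    {
      dct32_avr32(in, slot, lo, hi);   /* fills column `slot` of lo[] and hi[] */
    }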
  1000. diff --git a/fixed.h b/fixed.h
  1001. index 4b58abf..0a1350a 100644
  1002. --- a/fixed.h
  1003. +++ b/fixed.h
  1004. @@ -237,6 +237,46 @@ mad_fixed_t mad_f_mul_inline(mad_fixed_t x, mad_fixed_t y)
  1005. # define MAD_F_SCALEBITS MAD_F_FRACBITS
  1006. # endif
  1007. +/* --- AVR32 ----------------------------------------------------------------- */
  1008. +
  1009. +# elif defined(FPM_AVR32)
  1010. +
  1011. +typedef signed short mad_coeff_t;
  1012. +
  1013. +struct DWstruct {int high, low;};
  1014. +
  1015. +typedef union {
  1016. + struct DWstruct s;
  1017. + long long ll;
  1018. +} DWunion;
  1019. +
  1020. +# define MAD_F_MLX(hi, lo, x, y) \
  1021. + { register DWunion __res; \
  1022. + __res.ll = (long long)x * (long long)y; \
  1023. + /* asm ("muls.d\t%0, %1, %2" : "=r" (__res.ll) : "r" (x), "r" (y));*/ \
  1024. + hi = __res.s.high; \
  1025. + lo = __res.s.low; }
  1026. +
  1027. +# define MAD_F_MLA(hi, lo, x, y) \
  1028. + { register DWunion __res; \
  1029. + __res.s.high = hi; \
  1030. + __res.s.low = lo; \
  1031. + __res.ll += (long long)x * (long long)y; \
  1032. +/* asm ("macs.d\t%0, %1, %2" : "+r" (__res.ll) : "r" (x), "r" (y));*/ \
  1033. + hi = __res.s.high; \
  1034. + lo = __res.s.low; }
  1035. +
  1036. +
  1037. +# define MAD_F_MLN(hi, lo) \
  1038. + asm ("neg %0\n" \
  1039. + "acr %1\n" \
  1040. + "neg %1" \
  1041. + : "+r" (lo), "+r" (hi) \
  1042. + :: "cc")
  1043. +
  1044. +
  1045. +# define MAD_F_SCALEBITS MAD_F_FRACBITS
  1046. +
  1047. /* --- ARM ----------------------------------------------------------------- */
  1048. # elif defined(FPM_ARM)
  1049. @@ -433,6 +473,8 @@ mad_fixed_t mad_f_mul_inline(mad_fixed_t x, mad_fixed_t y)
  1050. *
  1051. * Pre-rounding is required to stay within the limits of compliance.
  1052. */
  1053. +typedef signed int mad_coeff_t;
  1054. +
  1055. # if defined(OPT_SPEED)
  1056. # define mad_f_mul(x, y) (((x) >> 12) * ((y) >> 16))
  1057. # else
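The union-based MAD_F_MLX/MAD_F_MLA above keep libmad's split hi:lo accumulator contract, so generic call sites compile unchanged (the commented-out asm shows the single-instruction muls.d/macs.d forms they are intended to become). A sketch of the standard accumulate-then-scale pattern built on them; MAD_F_ML0 and MAD_F_MLZ are stock fixed.h helpers that default to MAD_F_MLX and mad_f_scale64:

    mad_fixed_t dot3(mad_fixed_t const *x, mad_fixed_t const *y)
    {
      mad_fixed64hi_t hi;
      mad_fixed64lo_t lo;

      MAD_F_ML0(hi, lo, x[0], y[0]);   /* first product initializes hi:lo */
      MAD_F_MLA(hi, lo, x[1], y[1]);   /* later products accumulate       */
      MAD_F_MLA(hi, lo, x[2], y[2]);

      return MAD_F_MLZ(hi, lo);        /* scale the 64-bit sum back down  */
    }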
  1058. diff --git a/imdct_avr32.S b/imdct_avr32.S
  1059. new file mode 100644
  1060. index 0000000..d0ee6b4
  1061. --- /dev/null
  1062. +++ b/imdct_avr32.S
  1063. @@ -0,0 +1,789 @@
  1064. +/*
  1065. + Optimized 36-point Inverse Modified Cosine Transform (IMDCT)
  1066. + Copyright 2003-2006 Atmel Corporation.
  1067. +
  1068. + Written by Ronny Pedersen, Atmel Norway
  1069. +
  1070. + This program is free software; you can redistribute it and/or modify
  1071. + it under the terms of the GNU General Public License as published by
  1072. + the Free Software Foundation; either version 2 of the License, or
  1073. + (at your option) any later version.
  1074. +
  1075. + This program is distributed in the hope that it will be useful,
  1076. + but WITHOUT ANY WARRANTY; without even the implied warranty of
  1077. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1078. + GNU General Public License for more details.
  1079. +
  1080. + You should have received a copy of the GNU General Public License
  1081. + along with this program; if not, write to the Free Software
  1082. + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
  1083. +
  1084. +#define MAD_F(x) ((x + (1 << 13)) >> 14)
  1085. +
   1086. + .global imdct36_avr32
  1087. +
  1088. +/*
  1089. + void imdct36(mad_fixed_t const x[18], mad_fixed_t y[36])
  1090. + {
  1091. + mad_fixed_t tmp[18];
  1092. + int i;
  1093. +*/
  1094. +/* DCT-IV */
  1095. +imdct36_avr32:
  1096. + pushm r0-r7,r11,lr
  1097. + sub sp, 4*18
  1098. +/*
  1099. + {
  1100. + mad_fixed_t tmp2[18];
  1101. + int i;
  1102. +
  1103. + /* scale[i] = 2 * cos(PI * (2 * i + 1) / (4 * 18)) */
  1104. +/*
  1105. + static mad_fixed_t const scale[18] = {
  1106. + MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120),
  1107. + MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b),
  1108. + MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4),
  1109. + MAD_F(0x159e6f5b), MAD_F(0x137af940), MAD_F(0x11318ef3),
  1110. + MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5),
  1111. + MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
  1112. + };
  1113. +*/
  1114. +
  1115. + /* scaling */
  1116. +
  1117. +/*
  1118. + for (i = 0; i < 18; i += 3) {
  1119. + tmp2[i + 0] = mad_f_mul(x[i + 0], scale[i + 0]);
  1120. + tmp2[i + 1] = mad_f_mul(x[i + 1], scale[i + 1]);
  1121. + tmp2[i + 2] = mad_f_mul(x[i + 2], scale[i + 2]);
  1122. + }
  1123. +*/
  1124. + /* even input butterfly */
  1125. +
  1126. +/*
  1127. + for (i = 0; i < 9; i += 3) {
  1128. + tmp3[i + 0] = tmp2[i + 0] + tmp2[18 - (i + 0) - 1];
  1129. + tmp3[i + 1] = tmp2[i + 1] + tmp2[18 - (i + 1) - 1];
  1130. + tmp3[i + 2] = tmp2[i + 2] + tmp2[18 - (i + 2) - 1];
  1131. + }
  1132. + for (i = 0; i < 9; i += 3) {
  1133. + tmp4[i + 0] = tmp2[i + 0] - tmp2[18 - (i + 0) - 1];
  1134. + tmp4[i + 1] = tmp2[i + 1] - tmp2[18 - (i + 1) - 1];
  1135. + tmp4[i + 2] = tmp2[i + 2] - tmp2[18 - (i + 2) - 1];
  1136. + }
  1137. +*/
  1138. +
  1139. + ld.d r8, r12[0] /*r8 = x[1], r9 = x[0]*/
  1140. + ld.d r0, pc[scale_dctIV - .] /*r0 = {scale[2], scale[3]}, r1 = { scale[0], scale[1] }*/
  1141. + ld.d r2, r12[2*4] /*r2 = x[3], r3 = x[2]*/
  1142. + ld.d r4, pc[scale_dctIV - . + 14*2] /*r4 = {scale[16], scale[17]}, r5 = { scale[14], scale[15] }*/
  1143. + mulsatrndwh.w r9/*tmp2[0]*/, r9, r1:t /*tmp2[0] = mad_f_mul(x[0], scale[0]) */
  1144. + ld.d r6, r12[16*4] /*r6 = x[17], r7 = x[16]*/
  1145. + mulsatrndwh.w r8/*tmp2[1]*/, r8, r1:b /*tmp2[1] = mad_f_mul(x[1], scale[1]) */
  1146. + mulsatrndwh.w r3/*tmp2[2]*/, r3, r0:t /*tmp2[2] = mad_f_mul(x[2], scale[2]) */
  1147. + mulsatrndwh.w r2/*tmp2[3]*/, r2, r0:b /*tmp2[3] = mad_f_mul(x[3], scale[3]) */
  1148. + ld.d r0, r12[14*4] /*r0 = x[15], r1 = x[14]*/
  1149. + mulsatrndwh.w r7/*tmp2[16]*/, r7, r4:t /*tmp2[16] = mad_f_mul(x[16], scale[16]) */
  1150. + mulsatrndwh.w r6/*tmp2[17]*/, r6, r4:b /*tmp2[17] = mad_f_mul(x[17], scale[17]) */
  1151. + mulsatrndwh.w r1/*tmp2[14]*/, r1, r5:t /*tmp2[14] = mad_f_mul(x[14], scale[14]) */
  1152. + mulsatrndwh.w r0/*tmp2[15]*/, r0, r5:b /*tmp2[15] = mad_f_mul(x[15], scale[15]) */
  1153. +
  1154. + ld.d r4, r12[4*4] /*r4 = x[5], r5 = x[4]*/
  1155. +
  1156. + sub lr/*tmp4[0]*/, r9, r6
  1157. + add r6/*tmp3[0]*/, r9, r6
  1158. + sub r10/*tmp4[1]*/, r8, r7
  1159. + add r7/*tmp3[1]*/, r8, r7
  1160. + sub r9/*tmp4[2]*/, r3, r0
  1161. + add r0/*tmp3[2]*/, r3, r0
  1162. + sub r8/*tmp4[3]*/, r2, r1
  1163. + add r1/*tmp3[3]*/, r2, r1
  1164. +
  1165. + ld.d r2, pc[scale_dctIV - . + 4*2] /*r2 = {scale[6], scale[7]}, r3 = { scale[4], scale[5] }*/
  1166. +
  1167. + stm --sp, r8-r10, lr /*sp[0] = tmp4[0],sp[1] = tmp4[1],
  1168. + sp[2] = tmp4[2],sp[3] = tmp4[3] */
  1169. +
  1170. + /* Registers used: r0 = tmp3[2], r1 = tmp3[3], r6 = tmp3[0], r7 = tmp3[1], r12 = x
  1171. + Free registers: r2-r5, r8-r11, lr
  1172. + */
  1173. + ld.d r8, r12[6*4] /*r8 = x[7], r9 = x[6]*/
  1174. + ld.d r10, pc[scale_dctIV - . + 10*2] /*r10 = {scale[12], scale[13]}, r11 = { scale[10], scale[11] }*/
  1175. + mulsatrndwh.w r5/*tmp2[4]*/, r5, r3:t /*tmp2[4] = mad_f_mul(x[4], scale[4]) */
  1176. + mulsatrndwh.w r4/*tmp2[5]*/, r4, r3:b /*tmp2[5] = mad_f_mul(x[5], scale[5]) */
  1177. + mulsatrndwh.w r9/*tmp2[6]*/, r9, r2:t /*tmp2[6] = mad_f_mul(x[6], scale[6]) */
  1178. + mulsatrndwh.w r8/*tmp2[7]*/, r8, r2:b /*tmp2[7] = mad_f_mul(x[7], scale[7]) */
  1179. +
  1180. + ld.d r2, r12[12*4] /*r2 = x[13], r3 = x[12]*/
  1181. + ld.w lr, r12[11*4] /*lr = x[11] */
  1182. + mulsatrndwh.w r3/*tmp2[12]*/, r3, r10:t /*tmp2[12] = mad_f_mul(x[12], scale[12]) */
  1183. + mulsatrndwh.w r2/*tmp2[13]*/, r2, r10:b /*tmp2[13] = mad_f_mul(x[13], scale[13]) */
  1184. + ld.w r10, r12[10*4] /*r10 = x[10] */
  1185. + mulsatrndwh.w lr/*tmp2[11]*/, lr, r11:b /*tmp2[11] = mad_f_mul(x[11], scale[11]) */
  1186. + mulsatrndwh.w r10/*tmp2[10]*/, r10, r11:t /*tmp2[10] = mad_f_mul(x[10], scale[10]) */
  1187. +
  1188. + sub r11/*tmp4[4]*/, r5, r2
  1189. + add r2/*tmp3[4]*/, r5, r2
  1190. + sub r5/*tmp4[5]*/, r4, r3
  1191. + add r3/*tmp3[5]*/, r4, r3
  1192. + sub r4/*tmp4[6]*/, r9, lr
  1193. + add lr/*tmp3[6]*/, r9, lr
  1194. + sub r9/*tmp4[7]*/, r8, r10
  1195. + add r10/*tmp3[7]*/, r8, r10
  1196. + lddpc r8, scale_dctIV + 8*2 /*r8 = {scale[8], scale[9]} */
  1197. +
  1198. + stm --sp, r4, r5, r9, r11 /*sp[0] = tmp4[4],sp[1] = tmp4[7],
  1199. + sp[2] = tmp4[5],sp[3] = tmp4[6] */
  1200. + ld.d r4, r12[8*4] /*r4 = x[9], r5 = x[8]*/
  1201. + mulsatrndwh.w r5/*tmp2[8]*/, r5, r8:t /*tmp2[8] = mad_f_mul(x[8], scale[8]) */
  1202. + mulsatrndwh.w r4/*tmp2[9]*/, r4, r8:b /*tmp2[9] = mad_f_mul(x[9], scale[9]) */
  1203. + sub r9/*tmp4[8]*/, r5, r4
  1204. + add r5/*tmp3[8]*/, r5, r4
  1205. +
  1206. + st.w --sp, r9 /* sp[0] = tmp4[8] */
  1207. +
  1208. + /* Registers used:
  1209. +
  1210. + r0=tmp3[2], r1=tmp3[3], r2=tmp3[4], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
  1211. + r7 = tmp3[1], r10=tmp3[7], lr=tmp3[6]
  1212. + Free registers:
  1213. + r4, r8, r9, r11, r12
  1214. + */
  1215. +
  1216. +
  1217. + /* SDCT-II */
  1218. +/*
  1219. +
  1220. + {
  1221. + mad_fixed_t tmp3[9];
  1222. + int i;
  1223. +*/
  1224. + /* scale[i] = 2 * cos(PI * (2 * i + 1) / (2 * 18)) */
  1225. +/*
  1226. + static mad_fixed_t const scale[9] = {
  1227. + MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930),
  1228. + MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8),
  1229. + MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
  1230. + };
  1231. +*/
  1232. + /* divide the 18-point SDCT-II into two 9-point SDCT-IIs */
  1233. +
  1234. +
  1235. + /* fastdct */
  1236. +
  1237. +/*
  1238. + {
  1239. + mad_fixed_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12;
  1240. + mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
  1241. + mad_fixed_t m0, m1, m2, m3, m4, m5, m6, m7;
  1242. +*/
  1243. +// enum {
  1244. +// c0 = MAD_F(0x1f838b8d), /* 2 * cos( 1 * PI / 18) */
  1245. +// c1 = MAD_F(0x1bb67ae8), /* 2 * cos( 3 * PI / 18) */
  1246. +// c2 = MAD_F(0x18836fa3), /* 2 * cos( 4 * PI / 18) */
  1247. +// c3 = MAD_F(0x1491b752), /* 2 * cos( 5 * PI / 18) */
  1248. +// c4 = MAD_F(0x0af1d43a), /* 2 * cos( 7 * PI / 18) */
  1249. +// c5 = MAD_F(0x058e86a0), /* 2 * cos( 8 * PI / 18) */
  1250. +// c6 = -MAD_F(0x1e11f642) /* 2 * cos(16 * PI / 18) */
  1251. +// };
  1252. +
  1253. +/*
  1254. + a2 = tmp3[6] + tmp3[2];
  1255. + a6 = tmp3[8] + tmp3[0];
  1256. + a11 = a2 - a6;
  1257. + m5 = mad_f_mul(a11, -c6) ;
  1258. + a4 = tmp3[1] + tmp3[7];
  1259. +
  1260. + a18 = tmp3[4] + a4;
  1261. + a19 = -2 * tmp3[4] + a4;
  1262. +
  1263. + a0 = tmp3[3] + tmp3[5];
  1264. +
  1265. +*/
  1266. + add r11/*a4*/, r7, r10
  1267. + add r12/*a18*/, r2, r11
  1268. + sub r11/*a19*/, r11, r2<<1
  1269. +
  1270. + add r4/*a2*/, lr, r0
  1271. + add r8/*a6*/, r5, r6
  1272. + sub r9/*a11*/, r4, r8
  1273. +
   1274. + st.d --sp, r0 /* sp[0] = tmp3[3], sp[1] = tmp3[2]*/
  1275. +
  1276. + mov r2, MAD_F(0x1e11f642)
  1277. + mulsatrndwh.w r9/*m5*/, r9, r2:b
  1278. +
  1279. + add r2/*a0*/, r1, r3
  1280. +
  1281. + /* Registers used:
  1282. +
  1283. + r2=a0, r3=tmp3[5], r4=a2, r5=tmp3[8], r6 = tmp3[0],
  1284. + r7 = tmp3[1], r8=a6, r10=tmp3[7], r9=m5, r11=a19, r12=a18,lr=tmp3[6]
  1285. + Free registers:
  1286. + r0, r1
  1287. + */
  1288. +
  1289. +/*
  1290. + a8 = a0 + a2;
  1291. + a12 = a8 + a6;
  1292. + a10 = a0 - a6;
  1293. + a9 = a0 - a2;
  1294. + m7 = mad_f_mul(a9, -c2) ;
  1295. + m6 = mad_f_mul(a10, -c5) ;
  1296. +*/
  1297. +
  1298. + add r0/*a8*/, r2, r4
  1299. + add r0/*a12*/, r8
  1300. + rsub r8/*a10*/, r2
  1301. + sub r2/*a9*/, r4
  1302. + mov r1, -MAD_F(0x18836fa3)
  1303. + mulsatrndwh.w r2/*m7*/, r2, r1:b
  1304. + mov r1, -MAD_F(0x058e86a0)
  1305. + mulsatrndwh.w r8/*m6*/, r8, r1:b
  1306. +
  1307. + /* Registers used:
  1308. +
  1309. + r0=a12, r2=m7, r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
  1310. + r7 = tmp3[1], r8=m6, r10=tmp3[7], r9=m5, r11=a19, r12=a18,lr=tmp3[6]
  1311. + Free registers:
  1312. + r1, r4
  1313. + */
  1314. +
  1315. +
  1316. +/*
  1317. + a21 = -a19 - (m5 << 1);
  1318. + tmp[ 8] = a21 - (m6 << 1);
  1319. +
  1320. + a20 = a19 - (m5 << 1);
  1321. + tmp[ 4] = (m7 << 1) + a20;
  1322. + a22 = -a19 + (m6 << 1);
  1323. + tmp[16] = a22 + (m7 << 1);
  1324. + tmp[ 0] = a18 + a12;
  1325. + tmp[12] = a12 - 2 * a18;
  1326. +*/
  1327. + add r1/*a21*/, r11, r9 << 1
  1328. + neg r1
  1329. + sub r1/*tmp[8]*/, r1, r8 << 1
  1330. + stdsp sp[4*11/*tmp3[..] on the stack*/ + 8*4], r1
  1331. + sub r4/*a20*/, r11, r9 << 1
  1332. + add r4/*tmp[4]*/, r4, r2 << 1
  1333. + stdsp sp[4*11/*tmp3[..] on the stack*/ + 4*4], r4
  1334. + neg r11
  1335. + add r1/*a22*/, r11, r8 << 1
  1336. + add r1/*tmp[16]*/, r1, r2 << 1
  1337. + stdsp sp[4*11/*tmp3[..] on the stack*/ + 16*4], r1
  1338. + add r4, r12, r0
  1339. + sub r1, r0, r12 << 1
  1340. + stdsp sp[4*11/*tmp3[..] on the stack*/ + 0*4], r4
  1341. + stdsp sp[4*11/*tmp3[..] on the stack*/ + 12*4], r1
  1342. +
  1343. + ld.d r0, sp++
  1344. +
  1345. + /* Registers used:
  1346. +
  1347. + r0 = tmp3[2], r1 = tmp3[3], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
  1348. + r7 = tmp3[1], r10=tmp3[7], r11=a19, lr=tmp3[6]
  1349. + Free registers:
  1350. + r2,r4,r8,r9,r12
  1351. + */
  1352. +
  1353. +/*
  1354. + a5 = tmp3[1] - tmp3[7];
  1355. + a7 = tmp3[8] - tmp3[0];
  1356. + a3 = tmp3[6] - tmp3[2];
  1357. + a1 = tmp3[3] - tmp3[5];
  1358. + a13 = a1 - a3;
  1359. + a14 = a13 + a7;
  1360. + m3 = mad_f_mul(a14, -c1) ;
  1361. + m4 = mad_f_mul(a5, -c1) ;
  1362. + tmp[ 6] = m3 << 1;
  1363. +*/
  1364. + sub r7/*a5*/, r10
  1365. + sub r2/*a7*/, r5, r6
  1366. + sub r4/*a3*/, lr, r0
  1367. + sub r8/*a1*/, r1, r3
  1368. + sub r9/*a13*/, r8, r4
  1369. + add r12/*a14*/, r9, r2
  1370. + mov r0, -MAD_F(0x1bb67ae8)
  1371. + mulsatrndwh.w r12/*m3*/, r12, r0:b
  1372. + mulsatrndwh.w r7/*m4*/, r7, r0:b
  1373. + lsl r12, 1
  1374. + stdsp sp[4*9/*tmp3[..] on the stack*/ + 6*4], r12
  1375. +
  1376. + /* Registers used:
  1377. + r2 = a7, r4 = a3, r7 = m4, r8 = a1, r12 = m3
  1378. +
  1379. + Free registers:
  1380. + r0, r1, r3, r5, r6, r10, r9, r11, lr
  1381. + */
  1382. +
  1383. +
  1384. +/*
  1385. + a15 = a3 + a7;
  1386. + m2 = mad_f_mul(a15, -c4) ;
  1387. + a17 = a1 + a3;
  1388. + m0 = mad_f_mul(a17, -c3) ;
  1389. + a23 = (m4 << 1) + (m2 << 1);
  1390. + tmp[14] = a23 + (m0 << 1); */
  1391. + add r0/*a15*/, r4, r2
  1392. + mov r1, -MAD_F(0x0af1d43a)
  1393. + mulsatrndwh.w r0/*m2*/, r0, r1:b
  1394. + mov r3, -MAD_F(0x1491b752)
  1395. + add r5/*a17*/, r8, r4
  1396. + mulsatrndwh.w r5/*m0*/, r5, r3:b
  1397. + lsl r7, 1
  1398. + add r6/*a23*/, r7, r0 << 1
  1399. + add r6/*tmp[14]*/, r6, r5 << 1
  1400. + stdsp sp[4*9/*tmp3[..] on the stack*/ + 14*4], r6
  1401. +
  1402. + /* Registers used:
  1403. + r0 = m2, r2 = a7, r5 = m0, r7 = m4, r8 = a1
  1404. +
  1405. + Free registers:
  1406. + r1, r3, r4, r6, r10, r9, r11, lr
  1407. + */
  1408. +
  1409. +/*
  1410. + a16 = a1 - a7;
  1411. + m1 = mad_f_mul(a16, -c0) ;
  1412. + a24 = (m4 << 1) - (m2 << 1);
  1413. + tmp[10] = a24 - (m1 << 1);
  1414. +
  1415. + a25 = (m4 << 1) + (m1 << 1);
  1416. + tmp[ 2] = (m0 << 1) - a25;
  1417. +*/
  1418. + sub r3/*a16*/, r8, r2
  1419. + mov r4, -MAD_F(0x1f838b8d)
  1420. + mulsatrndwh.w r3/*m1*/, r3, r4:b
  1421. + sub r1/*a24*/, r7, r0 << 1
  1422. + sub r1/*tmp[10]*/, r1, r3 << 1
  1423. + stdsp sp[4*9/*tmp3[..] on the stack*/ + 10*4], r1
  1424. + add r7/*a25*/, r7, r3 << 1
  1425. + sub r7, r7, r5 << 1
  1426. + neg r7
  1427. + stdsp sp[4*9/*tmp3[..] on the stack*/ + 2*4], r7
  1428. +
  1429. +
  1430. +
  1431. +
  1432. + /* output to every other slot for convenience */
  1433. +
  1434. + /*} */
  1435. + /* End fastdct */
  1436. +
  1437. + /* odd input butterfly and scaling */
  1438. +
  1439. +
  1440. + /* On the stack:
 1441. + sp[0] = tmp4[8], sp[1] = tmp4[4], sp[2] = tmp4[7], sp[3] = tmp4[5], sp[4] = tmp4[6]
 1442. + sp[5] = tmp4[0], sp[6] = tmp4[1], sp[7] = tmp4[2], sp[8] = tmp4[3]
  1443. + */
  1444. +
  1445. + /*
  1446. + tmp3[0] = mad_f_mul(tmp4[0], scale[0]);
  1447. + tmp3[1] = mad_f_mul(tmp4[1], scale[1]) << 1;
  1448. + tmp3[2] = mad_f_mul(tmp4[2], scale[2]);
  1449. + tmp3[3] = mad_f_mul(tmp4[3], scale[3]) << 1;
  1450. + tmp3[4] = mad_f_mul(tmp4[4], scale[4]);
  1451. + tmp3[5] = mad_f_mul(tmp4[5], scale[5]);
  1452. + tmp3[6] = mad_f_mul(tmp4[6], scale[6]) << 1;
  1453. + tmp3[7] = mad_f_mul(tmp4[7], scale[7]);
  1454. + tmp3[8] = mad_f_mul(tmp4[8], scale[8]) << 1;
  1455. + */
  1456. + /* Registers used:
  1457. + r1 = tmp4[3], r2 = tmp4[2], r3 = tmp4[1], r4 = tmp4[0], r7 = tmp4[6]
  1458. + r10 = tmp4[5], r11 = tmp4[7], r12 = tmp4[4], lr = tmp4[8]
  1459. +
  1460. + Free registers:
  1461. + r0, r5, r6, r8, r9
  1462. + */
  1463. + ld.d r8, pc[ scale_sdctII - . + 4*2] /* r8 = { scale[6], scale[7] }, r9 = { scale[4], scale[5]} */
  1464. + ldm sp++, r1, r2, r3, r4, r7, r10, r11, r12, lr
 1465. + mov r5, MAD_F(0x02c9fad7) /* r5 = scale[8] */
  1466. + mulsatrndwh.w r5/*tmp3[8]*/, lr, r5:b
  1467. + mulsatrndwh.w lr/*tmp3[6]*/, r7, r8:t
  1468. + ld.d r6, pc[ scale_sdctII - . + 0*2] /* r6 = { scale[2], scale[3] }, r7 = { scale[0], scale[1]} */
  1469. + lsl lr, 1
  1470. + lsl r5, 1
  1471. + mulsatrndwh.w r0/*tmp3[2]*/, r2, r6:t
  1472. + mulsatrndwh.w r1/*tmp3[3]*/, r1, r6:b
  1473. + mulsatrndwh.w r6/*tmp3[0]*/, r4, r7:t
  1474. + mulsatrndwh.w r7/*tmp3[1]*/, r3, r7:b
  1475. + mulsatrndwh.w r3/*tmp3[5]*/, r10, r9:b
  1476. + mulsatrndwh.w r2/*tmp3[4]*/, r12, r9:t
  1477. + mulsatrndwh.w r9/*tmp3[7]*/, r11, r8:b
  1478. + lsl r1, 1
  1479. + lsl r7, 1
  1480. +
  1481. +
  1482. + /* fastdct */
  1483. +
  1484. +/*
  1485. + {
  1486. + mad_fixed_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12;
  1487. + mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
  1488. + mad_fixed_t m0, m1, m2, m3, m4, m5, m6, m7;
  1489. +*/
  1490. +// enum {
  1491. +// c0 = MAD_F(0x1f838b8d), /* 2 * cos( 1 * PI / 18) */
  1492. +// c1 = MAD_F(0x1bb67ae8), /* 2 * cos( 3 * PI / 18) */
  1493. +// c2 = MAD_F(0x18836fa3), /* 2 * cos( 4 * PI / 18) */
  1494. +// c3 = MAD_F(0x1491b752), /* 2 * cos( 5 * PI / 18) */
  1495. +// c4 = MAD_F(0x0af1d43a), /* 2 * cos( 7 * PI / 18) */
  1496. +// c5 = MAD_F(0x058e86a0), /* 2 * cos( 8 * PI / 18) */
  1497. +// c6 = -MAD_F(0x1e11f642) /* 2 * cos(16 * PI / 18) */
  1498. +// };
  1499. +
  1500. + /* Registers used:
  1501. +
  1502. + r0=tmp3[2], r1=tmp3[3], r2=tmp3[4], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
  1503. + r7 = tmp3[1], r9=tmp3[7], lr=tmp3[6]
  1504. + Free registers:
  1505. + r4, r8, r10, r11, r12
  1506. + */
  1507. +
  1508. +/*
  1509. + a2 = tmp3[6] + (tmp3[2] << 1);
  1510. + a6 = tmp3[8] + (tmp3[0] << 1);
  1511. + a11 = a2 - a6;
  1512. + m5 = mad_f_mul(a11, c6) ;
  1513. + a4 = tmp3[1] + (tmp3[7] << 1);
  1514. +
  1515. + a18 = (tmp3[4] << 1) + a4;
  1516. + a19 = -2 * (tmp3[4] << 1) + a4;
  1517. +
  1518. + a0 = tmp3[3] + (tmp3[5] << 1);
  1519. +
  1520. +*/
  1521. + add r11/*a4*/, r7, r9 << 1
  1522. + add r12/*a18*/, r11, r2 << 1
  1523. + sub r11/*a19*/, r11, r2 << 2
  1524. +
  1525. + add r4/*a2*/, lr, r0 << 1
  1526. + add r8/*a6*/, r5, r6 << 1
  1527. + sub r10/*a11*/, r4, r8
  1528. +
 1529. + st.d --sp, r0 /* sp[0] = tmp3[3], sp[1] = tmp3[2] */
  1530. +
  1531. + mov r2, -MAD_F(0x1e11f642)
  1532. + mulsatrndwh.w r10/*m5*/, r10, r2:b
  1533. +
  1534. + add r2/*a0*/, r1, r3 << 1
  1535. +
  1536. + /* Registers used:
  1537. +
  1538. + r2=a0, r3=tmp3[5], r4=a2, r5=tmp3[8], r6 = tmp3[0],
  1539. + r7 = tmp3[1], r8=a6, r9=tmp3[7], r10=m5, r11=a19, r12=a18,lr=tmp3[6]
  1540. + Free registers:
  1541. + r0, r1
  1542. + */
  1543. +
  1544. +/*
  1545. + a8 = a0 + a2;
  1546. + a12 = a8 + a6;
  1547. + a10 = a0 - a6;
  1548. + a9 = a0 - a2;
  1549. + m7 = mad_f_mul(a9, -c2) ;
  1550. + m6 = mad_f_mul(a10, -c5) ;
  1551. +*/
  1552. +
  1553. + add r0/*a8*/, r2, r4
  1554. + add r0/*a12*/, r8
  1555. + rsub r8/*a10*/, r2
  1556. + sub r2/*a9*/, r4
  1557. + mov r1, -MAD_F(0x18836fa3)
  1558. + mulsatrndwh.w r2/*m7*/, r2, r1:b
  1559. + mov r1, -MAD_F(0x058e86a0)
  1560. + mulsatrndwh.w r8/*m6*/, r8, r1:b
  1561. +
  1562. + /* Registers used:
  1563. +
  1564. + r0=a12, r2=m7, r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
  1565. + r7 = tmp3[1], r8=m6, r9=tmp3[7], r10=m5, r11=a19, r12=a18,lr=tmp3[6]
  1566. + Free registers:
  1567. + r1, r4
  1568. + */
  1569. +
  1570. +
  1571. +/*
  1572. + a21 = -a19 + (m5 << 1);
  1573. + tmp[ 9] = a21 - (m6 << 1);
  1574. +
  1575. + a20 = -(-a19 - (m5 << 1));
  1576. + tmp[ 5] = (m7 << 1) + a20;
  1577. + a22 = -a19 + (m6 << 1);
  1578. + tmp[17] = a22 + (m7 << 1);
  1579. + tmp[ 1] = a18 + a12;
  1580. + tmp[13] = a12 - 2 * a18;
  1581. +*/
  1582. + sub r1/*a21*/, r11, r10 << 1
  1583. + neg r1
  1584. + sub r1/*tmp[9]*/, r1, r8 << 1
  1585. + stdsp sp[4*2/*tmp3[..] on the stack*/ + 9*4], r1
  1586. + add r4/*a20*/, r11, r10 << 1
  1587. + add r4/*tmp[5]*/, r4, r2 << 1
  1588. + stdsp sp[4*2/*tmp3[..] on the stack*/ + 5*4], r4
  1589. + neg r11
  1590. + add r1/*a22*/, r11, r8 << 1
  1591. + add r1/*tmp[17]*/, r1, r2 << 1
  1592. + stdsp sp[4*2/*tmp3[..] on the stack*/ + 17*4], r1
  1593. + add r4, r12, r0
  1594. + sub r1, r0, r12 << 1
  1595. + stdsp sp[4*2/*tmp3[..] on the stack*/ + 1*4], r4
  1596. + stdsp sp[4*2/*tmp3[..] on the stack*/ + 13*4], r1
  1597. +
  1598. + ld.d r0, sp++
  1599. +
  1600. + /* Registers used:
  1601. +
  1602. + r0 = tmp3[2], r1 = tmp3[3], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
  1603. + r7 = tmp3[1], r9=tmp3[7], r11=a19, lr=tmp3[6]
  1604. + Free registers:
  1605. + r2,r4,r8,r10,r12
  1606. + */
  1607. +
  1608. +/*
  1609. + a5 = tmp3[1] - (tmp3[7] << 1);
  1610. + a7 = tmp3[8] - (tmp3[0] << 1);
  1611. + a3 = tmp3[6] - (tmp3[2] << 1);
  1612. + a1 = tmp3[3] - (tmp3[5] << 1);
  1613. + a13 = a1 - a3;
  1614. + a14 = a13 + a7;
  1615. + m3 = mad_f_mul(a14, -c1) ;
  1616. + m4 = mad_f_mul(a5, -c1) ;
  1617. + tmp[ 7] = m3 << 1;
  1618. +*/
  1619. + sub r7/*a5*/, r7, r9 << 1
  1620. + sub r2/*a7*/, r5, r6 << 1
  1621. + sub r4/*a3*/, lr, r0 << 1
  1622. + sub r8/*a1*/, r1, r3 << 1
  1623. + sub r10/*a13*/, r8, r4
  1624. + add r12/*a14*/, r10, r2
  1625. + mov r0, -MAD_F(0x1bb67ae8)
  1626. + mulsatrndwh.w r12/*m3*/, r12, r0:b
  1627. + mulsatrndwh.w r7/*m4*/, r7, r0:b
  1628. + lsl r12, 1
  1629. + stdsp sp[7*4], r12
  1630. +
  1631. + /* Registers used:
  1632. + r2 = a7, r4 = a3, r7 = m4, r8 = a1, r12 = m3
  1633. +
  1634. + Free registers:
  1635. + r0, r1, r3, r5, r6, r9, r10, r11, lr
  1636. + */
  1637. +
  1638. +
  1639. +/*
  1640. + a15 = a3 + a7;
  1641. + m2 = mad_f_mul(a15, -c4) ;
  1642. + a17 = a1 + a3;
  1643. + m0 = mad_f_mul(a17, -c3) ;
  1644. + a23 = (m4 << 1) + (m2 << 1);
  1645. + tmp[15] = a23 + (m0 << 1); */
  1646. + add r0/*a15*/, r4, r2
  1647. + mov r1, -MAD_F(0x0af1d43a)
  1648. + mulsatrndwh.w r0/*m2*/, r0, r1:b
  1649. + mov r3, -MAD_F(0x1491b752)
  1650. + add r5/*a17*/, r8, r4
  1651. + mulsatrndwh.w r5/*m0*/, r5, r3:b
  1652. + lsl r7, 1
  1653. + add r6/*a23*/, r7, r0 << 1
  1654. + add r6/*tmp[15]*/, r6, r5 << 1
  1655. + stdsp sp[15*4], r6
  1656. +
  1657. + /* Registers used:
  1658. + r0 = m2, r2 = a7, r5 = m0, r7 = m4, r8 = a1
  1659. +
  1660. + Free registers:
  1661. + r1, r3, r4, r6, r9, r10, r11, lr
  1662. + */
  1663. +
  1664. +/*
  1665. + a16 = a1 - a7;
  1666. + m1 = mad_f_mul(a16, -c0) ;
  1667. + a24 = (m4 << 1) - (m2 << 1);
  1668. + tmp[11] = a24 - (m1 << 1);
  1669. +
  1670. + a25 = (m4 << 1) + (m1 << 1);
  1671. + tmp[ 3] = (m0 << 1) - a25;
  1672. +*/
  1673. + sub r3/*a16*/, r8, r2
  1674. + mov r4, -MAD_F(0x1f838b8d)
  1675. + mulsatrndwh.w r3/*m1*/, r3, r4:b
  1676. + sub r1/*a24*/, r7, r0 << 1
  1677. + sub r1/*tmp[11]*/, r1, r3 << 1
  1678. + stdsp sp[11*4], r1
  1679. + add r7/*a25*/, r7, r3 << 1
  1680. + sub r7, r7, r5 << 1
  1681. + neg r7
  1682. + lddsp r12, sp[4*18+4] /* Get y from stack */
  1683. + stdsp sp[3*4], r7
  1684. +
  1685. +
  1686. + /* output to every other slot for convenience */
  1687. +
  1688. + /* End fastdct */
  1689. +
  1690. + /* output accumulation */
  1691. +
  1692. +/* for (i = 3; i < 18; i += 8) {
  1693. + tmp[i + 0] -= tmp[(i + 0) - 2];
  1694. + tmp[i + 2] -= tmp[(i + 2) - 2];
  1695. + tmp[i + 4] -= tmp[(i + 4) - 2];
  1696. + tmp[i + 6] -= tmp[(i + 6) - 2];
  1697. + }
  1698. + }
  1699. +*/
  1700. +
  1701. +/* End SDCT-II */
  1702. +
  1703. +
  1704. +
  1705. + /* scale reduction and output accumulation */
  1706. +
  1707. +/*
  1708. + for (i = 1; i < 17; i += 4) {
  1709. + tmp[i + 0] = tmp[i + 0] - tmp[(i + 0) - 1];
  1710. + tmp[i + 1] = tmp[i + 1] - tmp[(i + 1) - 1];
  1711. + tmp[i + 2] = tmp[i + 2] - tmp[(i + 2) - 1];
  1712. + tmp[i + 3] = tmp[i + 3] - tmp[(i + 3) - 1];
  1713. + }
  1714. + tmp[17] = tmp[17] - tmp[16];
  1715. + }
  1716. +*/
  1717. +/* End DCT-IV */
  1718. +
  1719. +
  1720. + /* convert 18-point DCT-IV to 36-point IMDCT */
  1721. +
  1722. +/*
  1723. + for (i = 0; i < 9; i += 3) {
  1724. + y[i + 0] = tmp[9 + (i + 0)];
  1725. + y[i + 1] = tmp[9 + (i + 1)];
  1726. + y[i + 2] = tmp[9 + (i + 2)];
  1727. + }
  1728. + for (i = 9; i < 27; i += 3) {
  1729. + y[i + 0] = -tmp[36 - (9 + (i + 0)) - 1];
  1730. + y[i + 1] = -tmp[36 - (9 + (i + 1)) - 1];
  1731. + y[i + 2] = -tmp[36 - (9 + (i + 2)) - 1];
  1732. + }
  1733. + for (i = 27; i < 36; i += 3) {
  1734. + y[i + 0] = -tmp[(i + 0) - 27];
  1735. + y[i + 1] = -tmp[(i + 1) - 27];
  1736. + y[i + 2] = -tmp[(i + 2) - 27];
  1737. + }
  1738. + }
  1739. +*/
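The ldm/sub sequence below folds the two commented post-processing loops and the sign/mirror mapping into one register pass over tmp[0..17]. The ordering is load-bearing: each odd-stride difference tmp[i] -= tmp[i - 2] must be formed before the unit-stride pass overwrites its operands. A plain C sketch of the same dependency order, using the tmp[] naming from the comments above (the postprocess name is illustrative):

/* the two commented loops, in the order the register code respects */
void postprocess(mad_fixed_t tmp[18])
{
  int i;
  for (i = 3; i < 18; i += 2)   /* output accumulation, odd indices */
    tmp[i] -= tmp[i - 2];
  for (i = 1; i < 18; i++)      /* scale reduction, every index */
    tmp[i] -= tmp[i - 1];
}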
  1740. +
  1741. + /* Registers used:
  1742. + r0 = tmp[8], r1 = tmp[7], r2 = tmp[6], r3 = tmp[5], r4 = tmp[4]
  1743. + r5 = tmp[3], r6 = tmp[2], r7 = tmp[1], r8 = tmp[0], r12 = y
  1744. +
  1745. + Free registers:
  1746. + r9, r10, r11, lr
  1747. + */
  1748. +
  1749. + ldm sp++, r0-r8 /* Get tmp[0]-tmp[8] from stack */
  1750. + sub r5, r7 /* tmp[3] -= tmp[1]*/
  1751. + sub r3, r5 /* tmp[5] -= tmp[3]*/
  1752. + sub r1, r3 /* tmp[7] -= tmp[5]*/
  1753. +
  1754. + sub r7, r8 /* tmp[1] -= tmp[0]*/
  1755. + sub r6, r7 /* tmp[2] -= tmp[1]*/
  1756. + sub r5, r6 /* tmp[3] -= tmp[2]*/
  1757. + neg r8
  1758. + st.w r12[26*4], r8 /* y[26] = -tmp[0] */
  1759. + st.w r12[27*4], r8 /* y[27] = -tmp[0] */
  1760. + neg r7
  1761. + neg r6
  1762. + st.w r12[25*4], r7 /* y[25] = -tmp[1] */
  1763. + st.w r12[24*4], r6 /* y[24] = -tmp[2] */
  1764. + st.d r12[28*4], r6 /* y[28] = -tmp[1], y[29] = -tmp[2]*/
  1765. +
  1766. + sub r4, r5 /* tmp[4] -= tmp[3]*/
  1767. + sub r3, r4 /* tmp[5] -= tmp[4]*/
  1768. + neg r5
  1769. + neg r4
  1770. + st.w r12[23*4], r5 /* y[23] = -tmp[3] */
  1771. + st.w r12[22*4], r4 /* y[22] = -tmp[4] */
  1772. + st.d r12[30*4], r4 /* y[30] = -tmp[3], y[31] = -tmp[4]*/
  1773. +
  1774. + ldm sp++, r4-r11,lr /* Get tmp[9]-tmp[17] from stack */
  1775. +
  1776. + sub r2, r3 /* tmp[6] -= tmp[5]*/
  1777. +
  1778. + sub lr, r1 /* tmp[9] -= tmp[7]*/
  1779. + sub r10, lr /* tmp[11] -= tmp[9]*/
  1780. + sub r8, r10 /* tmp[13] -= tmp[11]*/
  1781. + sub r6, r8 /* tmp[15] -= tmp[13]*/
  1782. + sub r4, r6 /* tmp[17] -= tmp[15]*/
  1783. +
  1784. + sub r1, r2 /* tmp[7] -= tmp[6]*/
  1785. + sub r0, r1 /* tmp[8] -= tmp[7]*/
  1786. + neg r3
  1787. + neg r2
  1788. + st.w r12[21*4], r3 /* y[21] = -tmp[5] */
  1789. + st.w r12[20*4], r2 /* y[20] = -tmp[6] */
  1790. + st.d r12[32*4], r2 /* y[32] = -tmp[5], y[33] = -tmp[6]*/
  1791. +
  1792. + sub lr, r0 /* tmp[9] -= tmp[8]*/
  1793. + sub r11, lr /* tmp[10] -= tmp[9]*/
  1794. + neg r1
  1795. + neg r0
  1796. + st.w r12[19*4], r1 /* y[19] = -tmp[7] */
  1797. + st.w r12[18*4], r0 /* y[18] = -tmp[8] */
  1798. + st.d r12[34*4], r0 /* y[34] = -tmp[7], y[35] = -tmp[8]*/
  1799. +
  1800. + sub r10, r11 /* tmp[11] -= tmp[10]*/
  1801. + sub r9, r10 /* tmp[12] -= tmp[11]*/
  1802. +
  1803. + st.w r12[0*4], lr /* y[0] = tmp[9]*/
  1804. + neg lr
  1805. + st.w r12[17*4], lr /* y[17] = -tmp[9]*/
  1806. + st.d r12[1*4], r10 /* y[1] = tmp[10], y[2] = tmp[11] */
  1807. + neg r11
  1808. + neg r10
  1809. + st.w r12[16*4], r11 /* y[16] = -tmp[10] */
  1810. + st.w r12[15*4], r10 /* y[15] = -tmp[11] */
  1811. +
  1812. +
  1813. + sub r8, r9 /* tmp[13] -= tmp[12]*/
  1814. + sub r7, r8 /* tmp[14] -= tmp[13]*/
  1815. + st.d r12[3*4], r8 /* y[3] = tmp[12], y[4] = tmp[13] */
  1816. + neg r9
  1817. + neg r8
  1818. + st.w r12[14*4], r9 /* y[14] = -tmp[12] */
  1819. + st.w r12[13*4], r8 /* y[13] = -tmp[13] */
  1820. +
  1821. + sub r6, r7 /* tmp[15] -= tmp[14]*/
  1822. + sub r5, r6 /* tmp[16] -= tmp[15]*/
  1823. + sub r4, r5 /* tmp[17] -= tmp[16]*/
  1824. +
  1825. + st.d r12[5*4], r6 /* y[5] = tmp[14], y[6] = tmp[15] */
  1826. + neg r7
  1827. + neg r6
  1828. + st.w r12[12*4], r7 /* y[12] = -tmp[14] */
  1829. + st.w r12[11*4], r6 /* y[11] = -tmp[15] */
  1830. +
  1831. + st.d r12[7*4], r4 /* y[7] = tmp[16], y[8] = tmp[17] */
  1832. + neg r5
  1833. + neg r4
  1834. + st.w r12[10*4], r5 /* y[10] = -tmp[16] */
  1835. + st.w r12[9*4], r4 /* y[9] = -tmp[17] */
  1836. +
  1837. + popm r0-r7,r11,pc
  1838. +
  1839. + .align 2
  1840. +scale_dctIV:
  1841. + .short MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120)
  1842. + .short MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b)
  1843. + .short MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4)
  1844. + .short MAD_F(0x159e6f5b), MAD_F(0x137af940), MAD_F(0x11318ef3)
  1845. + .short MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5)
  1846. + .short MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
  1847. +
  1848. + .align 2
  1849. +scale_sdctII:
  1850. + .short MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930)
  1851. + .short MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8)
  1852. + .short MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
  1853. diff --git a/layer3.c b/layer3.c
  1854. index 4e5d3fa..dffdab3 100644
  1855. --- a/layer3.c
  1856. +++ b/layer3.c
  1857. @@ -378,6 +378,11 @@ mad_fixed_t const ca[8] = {
  1858. -MAD_F(0x003a2847) /* -0.014198569 */, -MAD_F(0x000f27b4) /* -0.003699975 */
  1859. };
  1860. +#ifdef FPM_AVR32
  1861. +# undef MAD_F
  1862. +# define MAD_F(x) ((x + (1 << 12)) >> 13)
  1863. +#endif
  1864. +
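The temporary MAD_F override rounds the Q28 coefficient literals in the tables that follow down to Q15 halfwords, the width the AVR32 word-times-halfword multiplies consume. The conversion, as an illustrative helper:

/* Q28 -> Q15: add half of the discarded weight, then drop 13 bits */
static inline short q28_to_q15(long q28)
{
  return (short) ((q28 + (1L << 12)) >> 13);
}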
  1865. /*
  1866. * IMDCT coefficients for short blocks
  1867. * derived from section 2.4.3.4.10.2 of ISO/IEC 11172-3
  1868. @@ -386,7 +391,7 @@ mad_fixed_t const ca[8] = {
  1869. * imdct_s[i /odd][k] = cos((PI / 24) * (2 * (6 + (i-1)/2) + 7) * (2 * k + 1))
  1870. */
  1871. static
  1872. -mad_fixed_t const imdct_s[6][6] = {
  1873. +mad_coeff_t const imdct_s[6][6] = {
  1874. # include "imdct_s.dat"
  1875. };
  1876. @@ -398,7 +403,7 @@ mad_fixed_t const imdct_s[6][6] = {
  1877. * window_l[i] = sin((PI / 36) * (i + 1/2))
  1878. */
  1879. static
  1880. -mad_fixed_t const window_l[36] = {
  1881. +mad_coeff_t const window_l[36] = {
  1882. MAD_F(0x00b2aa3e) /* 0.043619387 */, MAD_F(0x0216a2a2) /* 0.130526192 */,
  1883. MAD_F(0x03768962) /* 0.216439614 */, MAD_F(0x04cfb0e2) /* 0.300705800 */,
  1884. MAD_F(0x061f78aa) /* 0.382683432 */, MAD_F(0x07635284) /* 0.461748613 */,
  1885. @@ -429,7 +434,7 @@ mad_fixed_t const window_l[36] = {
  1886. * window_s[i] = sin((PI / 12) * (i + 1/2))
  1887. */
  1888. static
  1889. -mad_fixed_t const window_s[12] = {
  1890. +mad_coeff_t const window_s[12] = {
  1891. MAD_F(0x0216a2a2) /* 0.130526192 */, MAD_F(0x061f78aa) /* 0.382683432 */,
  1892. MAD_F(0x09bd7ca0) /* 0.608761429 */, MAD_F(0x0cb19346) /* 0.793353340 */,
  1893. MAD_F(0x0ec835e8) /* 0.923879533 */, MAD_F(0x0fdcf549) /* 0.991444861 */,
  1894. @@ -438,6 +443,11 @@ mad_fixed_t const window_s[12] = {
  1895. MAD_F(0x061f78aa) /* 0.382683432 */, MAD_F(0x0216a2a2) /* 0.130526192 */,
  1896. };
  1897. +#ifdef FPM_AVR32
  1898. +# undef MAD_F
  1899. +# define MAD_F(x) ((mad_fixed_t) (x##L))
  1900. +#endif
  1901. +
  1902. /*
  1903. * coefficients for intensity stereo processing
  1904. * derived from section 2.4.3.4.9.3 of ISO/IEC 11172-3
  1905. @@ -879,6 +889,42 @@ void III_exponents(struct channel const *channel,
  1906. * NAME: III_requantize()
  1907. * DESCRIPTION: requantize one (positive) value
  1908. */
  1909. +
  1910. +#if 0
  1911. +/*static*/
  1912. +mad_fixed_t III_requantize(unsigned int value, signed int exp)
  1913. +{
  1914. + register mad_fixed_t tmp2, tmp3;
  1915. + long long tmp_d;
  1916. +
  1917. + asm ("asr\t%0, %1, 2\n"
  1918. + "ld.w\t%2, %4[%5 << 2]\n"
  1919. + "sub\t%1, %1, %0 << 2\n"
  1920. + "asr\t%3, %2, 7\n"
  1921. + "andl\t%2, 0x7f, COH\n"
  1922. + "add\t%0, %2\n"
  1923. + "lsl\t%m0,%3,%0\n"
  1924. + "neg\t%0\n"
  1925. + "asr\t%3,%3,%0\n"
  1926. + "add\t%2, %6, %1 << 2\n"
  1927. + "ld.w\t%2, %2[12]\n"
  1928. + "cp.w\t%0, 0\n"
  1929. + "movlt\t%3, %m0\n"
  1930. + "muls.d\t%0, %3, %2\n"
  1931. + "cp.w\t%1, 0\n"
  1932. + "breq\t0f\n"
  1933. + "lsr\t%0, %0, 28\n"
  1934. + "or\t%3, %0, %m0 << 4\n"
  1935. + "0:\n"
  1936. + : "=&r"(tmp_d), "+r"(exp), "=&r"(tmp2), "=&r"(tmp3)
  1937. + : "r"(&rq_table), "r"(value), "r"(root_table));
  1938. +
  1939. +
  1940. + return tmp3;
  1941. +}
  1942. +
  1943. +#else
  1944. +
  1945. static
  1946. mad_fixed_t III_requantize(unsigned int value, signed int exp)
  1947. {
  1948. @@ -918,6 +964,7 @@ mad_fixed_t III_requantize(unsigned int value, signed int exp)
  1949. return frac ? mad_f_mul(requantized, root_table[3 + frac]) : requantized;
  1950. }
  1951. +#endif
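For orientation, both the inline-asm variant above and the C version compute the ISO/IEC 11172-3 requantization value^(4/3) scaled by a power of two; the root_table[3 + frac] correction suggests exp counts quarter powers. A floating-point reference under that assumption (requantize_ref is illustrative, not part of libmad):

#include <math.h>

/* reference: value^(4/3) * 2^(exp / 4), computed in double precision */
static double requantize_ref(unsigned int value, signed int exp)
{
  return pow((double) value, 4.0 / 3.0) * pow(2.0, exp / 4.0);
}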
  1952. /* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */
  1953. # define MASK(cache, sz, bits) \
  1954. @@ -2054,27 +2101,42 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36])
  1955. }
  1956. # endif
  1957. +
  1958. +#ifdef FPM_AVR32
  1959. +# undef mad_f_mul
  1960. +# define mad_f_mul(x, y) __builtin_mulsatrndwh_w(x, y)
  1961. +#endif
  1962. +
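This maps mad_f_mul onto the AVR32 fractional word-times-halfword multiply. A C model of what the builtin is assumed to compute here, given Q28 samples and Q15 coefficients:

#include <stdint.h>

/* assumed semantics of __builtin_mulsatrndwh_w(x, h):
   (x * h + 2^14) >> 15, saturated to 32 bits -- Q28 * Q15 -> Q28 */
static int32_t mulsatrndwh_w_model(int32_t x, int16_t h)
{
  int64_t p = ((int64_t) x * h + (1 << 14)) >> 15;
  if (p > INT32_MAX) p = INT32_MAX;
  if (p < INT32_MIN) p = INT32_MIN;
  return (int32_t) p;
}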
  1963. /*
  1964. * NAME: III_imdct_l()
  1965. * DESCRIPTION: perform IMDCT and windowing for long blocks
  1966. */
  1967. static
  1968. -void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
  1969. +void III_imdct_l(mad_fixed_t /*const*/ X[18], mad_fixed_t z[36],
  1970. unsigned int block_type)
  1971. {
  1972. unsigned int i;
  1973. + mad_fixed_t *z_ptr;
  1974. + mad_coeff_t *w_ptr;
  1975. /* IMDCT */
  1976. +#ifdef FPM_AVR32
  1977. + imdct36_avr32(X, z);
  1978. +#else
  1979. imdct36(X, z);
  1980. +#endif
  1981. /* windowing */
  1982. + z_ptr = &z[0];
  1983. + w_ptr = &window_l[0];
  1984. +
  1985. switch (block_type) {
  1986. case 0: /* normal window */
  1987. # if defined(ASO_INTERLEAVE1)
  1988. {
  1989. - register mad_fixed_t tmp1, tmp2;
  1990. + register mad_coeff_t tmp1, tmp2;
  1991. tmp1 = window_l[0];
  1992. tmp2 = window_l[1];
  1993. @@ -2091,15 +2153,16 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
  1994. }
  1995. # elif defined(ASO_INTERLEAVE2)
  1996. {
  1997. - register mad_fixed_t tmp1, tmp2;
  1998. + register mad_fixed_t tmp1;
  1999. + register mad_coeff_t tmp2;
  2000. - tmp1 = z[0];
  2001. - tmp2 = window_l[0];
  2002. + tmp1 = *z_ptr;
  2003. + tmp2 = *w_ptr++;
  2004. for (i = 0; i < 35; ++i) {
  2005. - z[i] = mad_f_mul(tmp1, tmp2);
  2006. - tmp1 = z[i + 1];
  2007. - tmp2 = window_l[i + 1];
  2008. + *z_ptr++ = mad_f_mul(tmp1, tmp2);
  2009. + tmp1 = *z_ptr;
  2010. + tmp2 = *w_ptr++;
  2011. }
  2012. z[35] = mad_f_mul(tmp1, tmp2);
  2013. @@ -2118,23 +2181,28 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
  2014. case 1: /* start block */
  2015. for (i = 0; i < 18; i += 3) {
  2016. - z[i + 0] = mad_f_mul(z[i + 0], window_l[i + 0]);
  2017. - z[i + 1] = mad_f_mul(z[i + 1], window_l[i + 1]);
  2018. - z[i + 2] = mad_f_mul(z[i + 2], window_l[i + 2]);
 2019. + *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); z_ptr++;
 2020. + *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); z_ptr++;
 2021. + *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); z_ptr++;
  2022. }
  2023. + z_ptr += 6;
  2024. + w_ptr = &window_s[6];
  2025. /* (i = 18; i < 24; ++i) z[i] unchanged */
  2026. - for (i = 24; i < 30; ++i) z[i] = mad_f_mul(z[i], window_s[i - 18]);
  2027. - for (i = 30; i < 36; ++i) z[i] = 0;
 2028. + for (i = 24; i < 30; ++i, ++z_ptr) *z_ptr = mad_f_mul(*z_ptr, *w_ptr++);
  2029. + for (i = 30; i < 36; ++i) *z_ptr++ = 0;
  2030. break;
  2031. case 3: /* stop block */
  2032. - for (i = 0; i < 6; ++i) z[i] = 0;
  2033. - for (i = 6; i < 12; ++i) z[i] = mad_f_mul(z[i], window_s[i - 6]);
  2034. + w_ptr = &window_s[0];
  2035. + for (i = 0; i < 6; ++i) *z_ptr++ = 0;
 2036. + for (i = 6; i < 12; ++i, ++z_ptr) *z_ptr = mad_f_mul(*z_ptr, *w_ptr++);
  2037. /* (i = 12; i < 18; ++i) z[i] unchanged */
  2038. + w_ptr = &window_l[18];
  2039. + z_ptr += 6;
  2040. for (i = 18; i < 36; i += 3) {
  2041. - z[i + 0] = mad_f_mul(z[i + 0], window_l[i + 0]);
  2042. - z[i + 1] = mad_f_mul(z[i + 1], window_l[i + 1]);
  2043. - z[i + 2] = mad_f_mul(z[i + 2], window_l[i + 2]);
 2044. + *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); z_ptr++;
 2045. + *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); z_ptr++;
 2046. + *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); z_ptr++;
  2047. }
  2048. break;
  2049. }
  2050. @@ -2146,10 +2214,10 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
  2051. * DESCRIPTION: perform IMDCT and windowing for short blocks
  2052. */
  2053. static
  2054. -void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
  2055. +void III_imdct_s(mad_fixed_t /*const*/ X[18], mad_fixed_t z[36])
  2056. {
  2057. mad_fixed_t y[36], *yptr;
  2058. - mad_fixed_t const *wptr;
  2059. + mad_coeff_t const *wptr;
  2060. int w, i;
  2061. register mad_fixed64hi_t hi;
  2062. register mad_fixed64lo_t lo;
  2063. @@ -2159,11 +2227,56 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
  2064. yptr = &y[0];
  2065. for (w = 0; w < 3; ++w) {
  2066. - register mad_fixed_t const (*s)[6];
  2067. + register mad_coeff_t const (*s)[6];
  2068. s = imdct_s;
  2069. for (i = 0; i < 3; ++i) {
  2070. +#ifdef FPM_AVR32
  2071. + register long long int acc, tmp1, tmp2, tmp3, tmp4;
  2072. + asm volatile ("ld.d\t%0, %5++\n"
  2073. + "ld.d\t%1, %6[0]\n"
  2074. + "ld.d\t%2, %6[2*4]\n"
  2075. + "ld.d\t%3, %6[4*4]\n"
  2076. + "mulwh.d\t%4, %m1, %m0:t\n"
  2077. + "macwh.d\t%4, %1, %m0:b\n"
  2078. + "ld.w\t%m0, %5++\n"
  2079. + "macwh.d\t%4, %m2, %0:t\n"
  2080. + "macwh.d\t%4, %2, %0:b\n"
  2081. + "macwh.d\t%4, %m3, %m0:t\n"
  2082. + "macwh.d\t%4, %3, %m0:b\n"
  2083. + "ld.d\t%0, %5++\n"
  2084. + "rol\t%4\n"
  2085. + "rol\t%m4\n"
  2086. + : "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3), "=&r"(tmp4),
  2087. + "=&r"(acc), "+r"(s)
  2088. + : "r"(X));
  2089. +
  2090. + asm volatile ("st.w\t%1[0], %m0\n"
  2091. + "neg\t%m0\n"
  2092. + "st.w\t%2[5*4], %m0\n"
  2093. + : "+r"(acc)
  2094. + : "r"(&yptr[i]), "r"(&yptr[-i]));
  2095. +
  2096. + asm volatile ("mulwh.d\t%4, %m1, %m0:t\n"
  2097. + "macwh.d\t%4, %1, %m0:b\n"
  2098. + "ld.w\t%m0, %5++\n"
  2099. + "macwh.d\t%4, %m2, %0:t\n"
  2100. + "macwh.d\t%4, %2, %0:b\n"
  2101. + "macwh.d\t%4, %m3, %m0:t\n"
  2102. + "macwh.d\t%4, %3, %m0:b\n"
  2103. + "rol\t%4\n"
  2104. + "rol\t%m4\n"
  2105. + : "+r"(tmp1), "+r"(tmp2), "+r"(tmp3), "+r"(tmp4),
  2106. + "=&r"(acc), "+r"(s)
  2107. + : "r"(X));
  2108. +
  2109. + asm volatile ( "st.w\t%1[6*4], %m0\n"
  2110. + "st.w\t%2[11*4], %m0\n"
  2111. + :: "r"(acc), "r"(&yptr[i]), "r"(&yptr[-i]));
  2112. +
  2113. +
  2114. +#else
  2115. MAD_F_ML0(hi, lo, X[0], (*s)[0]);
  2116. MAD_F_MLA(hi, lo, X[1], (*s)[1]);
  2117. MAD_F_MLA(hi, lo, X[2], (*s)[2]);
  2118. @@ -2187,6 +2300,7 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
  2119. yptr[11 - i] = yptr[i + 6];
  2120. ++s;
  2121. +#endif
  2122. }
  2123. yptr += 12;
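The inline-asm groups above replace each MAD_F_ML0/MLA chain with a six-tap multiply-accumulate: mulwh.d/macwh.d are assumed to collect (word x halfword) << 16 products in a 64-bit register pair, and the rol pair doubles the accumulator so its high word lands on Q28. A C model under those assumptions:

#include <stdint.h>

/* one 6-tap IMDCT dot product: X[] is Q28, s[] is a Q15 imdct_s row */
static int32_t imdct_tap6_model(const int32_t X[6], const int16_t s[6])
{
  int64_t acc = 0;
  int k;
  for (k = 0; k < 6; k++)
    acc += ((int64_t) X[k] * s[k]) << 16;  /* Q28 * Q15 -> Q59 */
  return (int32_t) ((acc << 1) >> 32);     /* rol pair + high word: Q28 */
}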
  2124. @@ -2198,6 +2312,196 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
  2125. yptr = &y[0];
  2126. wptr = &window_s[0];
  2127. +#ifdef FPM_AVR32
  2128. + /* z[0] = 0;
  2129. + z[1] = 0;
  2130. + z[2] = 0;
  2131. + z[3] = 0;
  2132. + z[4] = 0;
  2133. + z[5] = 0;
  2134. + z[30] = 0;
  2135. + z[31] = 0;
  2136. + z[32] = 0;
  2137. + z[33] = 0;
  2138. + z[34] = 0;
  2139. + z[35] = 0;
  2140. + */
  2141. + {
  2142. + register long long int tmp, tmp2, tmp3, w0123, w4567, w891011;
  2143. + asm volatile ("mov\t%m0, 0\n"
  2144. + "mov\t%0, %m0\n"
  2145. + "st.d\t%1[0], %0\n"
  2146. + "st.d\t%1[2*4], %0\n"
  2147. + "st.d\t%1[4*4], %0\n"
  2148. + "st.d\t%1[30*4], %0\n"
  2149. + "st.d\t%1[32*4], %0\n"
  2150. + "st.d\t%1[34*4], %0\n"
  2151. + : "=&r"(tmp) : "r"(z));
  2152. +
  2153. +
  2154. +
  2155. + /*
  2156. + z[6] = mad_f_mul(yptr [0], wptr[0]);
  2157. + z[7] = mad_f_mul(yptr [1], wptr[1]);
  2158. + z[8] = mad_f_mul(yptr [2], wptr[2]);
  2159. + z[9] = mad_f_mul(yptr [3], wptr[3]);
  2160. + z[10] = mad_f_mul(yptr[4], wptr[4]);
  2161. + z[11] = mad_f_mul(yptr[5], wptr[5]);
  2162. + z[24] = mad_f_mul(yptr [30], wptr[6]);
  2163. + z[25] = mad_f_mul(yptr [31], wptr[7]);
  2164. + z[26] = mad_f_mul(yptr [32], wptr[8]);
  2165. + z[27] = mad_f_mul(yptr [33], wptr[9]);
  2166. + z[28] = mad_f_mul(yptr[34], wptr[10]);
  2167. + z[29] = mad_f_mul(yptr[35], wptr[11]);
  2168. + */
  2169. +
  2170. +
  2171. + asm volatile ("ld.d\t%0, %5[0*4]\n"
  2172. + "ld.d\t%3, %6[0*4]\n"
  2173. + "ld.d\t%1, %5[2*4]\n"
  2174. + "ld.d\t%2, %5[4*4]\n"
  2175. + "mulsatrndwh.w\t%m3, %m3, %m0:t\n"
  2176. + "mulsatrndwh.w\t%3, %3, %m0:b\n"
  2177. + "ld.d\t%4, %6[2*4]\n"
  2178. + "st.d\t%7[6*4], %3\n"
  2179. +
  2180. + "mulsatrndwh.w\t%m4, %m4, %0:t\n"
  2181. + "mulsatrndwh.w\t%4, %4, %0:b\n"
  2182. + "ld.d\t%3, %6[4*4]\n"
  2183. + "st.d\t%7[8*4], %4\n"
  2184. +
  2185. + "mulsatrndwh.w\t%m3, %m3, %m1:t\n"
  2186. + "mulsatrndwh.w\t%3, %3, %m1:b\n"
  2187. + "ld.d\t%4, %6[30*4]\n"
  2188. + "st.d\t%7[10*4], %3\n"
  2189. +
  2190. + "mulsatrndwh.w\t%m4, %m4, %1:t\n"
  2191. + "mulsatrndwh.w\t%4, %4, %1:b\n"
  2192. + "ld.d\t%3, %6[32*4]\n"
  2193. + "st.d\t%7[24*4], %4\n"
  2194. +
  2195. + "mulsatrndwh.w\t%m3, %m3, %m2:t\n"
  2196. + "mulsatrndwh.w\t%3, %3, %m2:b\n"
  2197. + "ld.d\t%4, %6[34*4]\n"
  2198. + "st.d\t%7[26*4], %3\n"
  2199. +
  2200. + "mulsatrndwh.w\t%m4, %m4, %2:t\n"
  2201. + "mulsatrndwh.w\t%4, %4, %2:b\n"
  2202. + "st.d\t%7[28*4], %4\n"
  2203. +
  2204. + : "=&r"(w0123), "=&r"(w4567), "=&r"(w891011), "=&r"(tmp), "=&r"(tmp2)
  2205. + : "r"(wptr), "r"(yptr), "r"(z));
  2206. + /*
  2207. + MAD_F_ML0(hi, lo, yptr[6], wptr[6]);
  2208. + MAD_F_MLA(hi, lo, yptr[12], wptr[0]);
  2209. + z[12] = MAD_F_MLZ(hi, lo);
  2210. + MAD_F_ML0(hi, lo, yptr[7], wptr[7]);
  2211. + MAD_F_MLA(hi, lo, yptr[13], wptr[1]);
  2212. + z[13] = MAD_F_MLZ(hi, lo);
  2213. + MAD_F_ML0(hi, lo, yptr[8], wptr[8]);
  2214. + MAD_F_MLA(hi, lo, yptr[14], wptr[2]);
  2215. + z[14] = MAD_F_MLZ(hi, lo);
  2216. + MAD_F_ML0(hi, lo, yptr[9], wptr[9]);
  2217. + MAD_F_MLA(hi, lo, yptr[15], wptr[3]);
  2218. + z[15] = MAD_F_MLZ(hi, lo);
  2219. + MAD_F_ML0(hi, lo, yptr[10], wptr[10]);
  2220. + MAD_F_MLA(hi, lo, yptr[16], wptr[4]);
  2221. + z[16] = MAD_F_MLZ(hi, lo);
  2222. + MAD_F_ML0(hi, lo, yptr[11], wptr[11]);
  2223. + MAD_F_MLA(hi, lo, yptr[17], wptr[5]);
  2224. + z[17] = MAD_F_MLZ(hi, lo);
  2225. +
  2226. + MAD_F_ML0(hi, lo, yptr[18], wptr[6]);
  2227. + MAD_F_MLA(hi, lo, yptr[24], wptr[0]);
  2228. + z[18] = MAD_F_MLZ(hi, lo);
  2229. + MAD_F_ML0(hi, lo, yptr[19], wptr[7]);
  2230. + MAD_F_MLA(hi, lo, yptr[25], wptr[1]);
  2231. + z[19] = MAD_F_MLZ(hi, lo);
  2232. + MAD_F_ML0(hi, lo, yptr[20], wptr[8]);
  2233. + MAD_F_MLA(hi, lo, yptr[26], wptr[2]);
  2234. + z[20] = MAD_F_MLZ(hi, lo);
  2235. + MAD_F_ML0(hi, lo, yptr[21], wptr[9]);
  2236. + MAD_F_MLA(hi, lo, yptr[27], wptr[3]);
  2237. + z[21] = MAD_F_MLZ(hi, lo);
  2238. + MAD_F_ML0(hi, lo, yptr[22], wptr[10]);
  2239. + MAD_F_MLA(hi, lo, yptr[28], wptr[4]);
  2240. + z[22] = MAD_F_MLZ(hi, lo);
  2241. + MAD_F_ML0(hi, lo, yptr[23], wptr[11]);
  2242. + MAD_F_MLA(hi, lo, yptr[29], wptr[5]);
  2243. + z[23] = MAD_F_MLZ(hi, lo);*/
  2244. +
  2245. +
  2246. + asm volatile ("ld.d\t%0, %3[6*4]\n"
  2247. + "ld.d\t%1, %3[12*4]\n"
  2248. + "mulwh.d\t%2, %m0, %5:t\n"
  2249. + "macwh.d\t%2, %m1, %m4:t\n"
  2250. + "mulwh.d\t%0, %0, %5:b\n"
  2251. + "macwh.d\t%0, %1, %m4:b\n"
  2252. + "lsl\t%m2, 1\n"
  2253. + "lsl\t%2, %m0, 1\n"
  2254. + "st.d\t%6[12*4], %2\n"
  2255. +
  2256. + "ld.d\t%0, %3[18*4]\n"
  2257. + "ld.d\t%1, %3[24*4]\n"
  2258. + "mulwh.d\t%2, %m0, %5:t\n"
  2259. + "macwh.d\t%2, %m1, %m4:t\n"
  2260. + "mulwh.d\t%0, %0, %5:b\n"
  2261. + "macwh.d\t%0, %1, %m4:b\n"
  2262. + "lsl\t%m2, 1\n"
  2263. + "lsl\t%2, %m0, 1\n"
  2264. + "st.d\t%6[18*4], %2\n"
  2265. +
  2266. + : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
  2267. + : "r"(yptr), "r"(w0123), "r"(w4567), "r"(z));
  2268. +
  2269. + asm volatile ("ld.d\t%0, %3[8*4]\n"
  2270. + "ld.d\t%1, %3[14*4]\n"
  2271. + "mulwh.d\t%2, %m0, %m5:t\n"
  2272. + "macwh.d\t%2, %m1, %4:t\n"
  2273. + "mulwh.d\t%0, %0, %m5:b\n"
  2274. + "macwh.d\t%0, %1, %4:b\n"
  2275. + "lsl\t%m2, 1\n"
  2276. + "lsl\t%2, %m0, 1\n"
  2277. + "st.d\t%6[14*4], %2\n"
  2278. +
  2279. + "ld.d\t%0, %3[20*4]\n"
  2280. + "ld.d\t%1, %3[26*4]\n"
  2281. + "mulwh.d\t%2, %m0, %m5:t\n"
  2282. + "macwh.d\t%2, %m1, %4:t\n"
  2283. + "mulwh.d\t%0, %0, %m5:b\n"
  2284. + "macwh.d\t%0, %1, %4:b\n"
  2285. + "lsl\t%m2, 1\n"
  2286. + "lsl\t%2, %m0, 1\n"
  2287. + "st.d\t%6[20*4], %2\n"
  2288. +
  2289. + : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
  2290. + : "r"(yptr), "r"(w0123), "r"(w891011), "r"(z));
  2291. +
  2292. + asm volatile ("ld.d\t%0, %3[10*4]\n"
  2293. + "ld.d\t%1, %3[16*4]\n"
  2294. + "mulwh.d\t%2, %m0, %5:t\n"
  2295. + "macwh.d\t%2, %m1, %m4:t\n"
  2296. + "mulwh.d\t%0, %0, %5:b\n"
  2297. + "macwh.d\t%0, %1, %m4:b\n"
  2298. + "lsl\t%m2, 1\n"
  2299. + "lsl\t%2, %m0, 1\n"
  2300. + "st.d\t%6[16*4], %2\n"
  2301. +
  2302. + "ld.d\t%0, %3[22*4]\n"
  2303. + "ld.d\t%1, %3[28*4]\n"
  2304. + "mulwh.d\t%2, %m0, %5:t\n"
  2305. + "macwh.d\t%2, %m1, %m4:t\n"
  2306. + "mulwh.d\t%0, %0, %5:b\n"
  2307. + "macwh.d\t%0, %1, %m4:b\n"
  2308. + "lsl\t%m2, 1\n"
  2309. + "lsl\t%2, %m0, 1\n"
  2310. + "st.d\t%6[22*4], %2\n"
  2311. +
  2312. + : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
  2313. + : "r"(yptr), "r"(w4567), "r"(w891011), "r"(z));
  2314. +
  2315. + }
  2316. +#else
  2317. for (i = 0; i < 6; ++i) {
  2318. z[i + 0] = 0;
  2319. z[i + 6] = mad_f_mul(yptr[ 0 + 0], wptr[0]);
  2320. @@ -2218,8 +2522,15 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
  2321. ++yptr;
  2322. ++wptr;
  2323. }
  2324. +#endif
  2325. }
  2326. +#ifdef FPM_AVR32
  2327. +# undef mad_f_mul
  2328. +# define mad_f_mul(x, y) ((((x) + (1L << 11)) >> 12) * \
  2329. + (((y) + (1L << 15)) >> 16))
  2330. +#endif
  2331. +
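Restoring mad_f_mul for III_overlap: the OPT_SSO-style split rounds 12 bits off one Q28 operand and 16 off the other so a single 32-bit multiply lands on the Q28 product, since (x / 2^12) * (y / 2^16) = x * y / 2^28. A quick check of the macro against the exact product:

#include <stdint.h>

static int32_t sso_mul(int32_t x, int32_t y)   /* the macro above */
{
  return ((x + (1 << 11)) >> 12) * ((y + (1 << 15)) >> 16);
}

static int32_t exact_mul(int32_t x, int32_t y) /* full 64-bit reference */
{
  return (int32_t) (((int64_t) x * y) >> 28);
}
/* sso_mul differs from exact_mul only in the low bits -- the usual
   OPT_SSO precision-for-speed trade */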
  2332. /*
  2333. * NAME: III_overlap()
  2334. * DESCRIPTION: perform overlap-add of windowed IMDCT outputs
  2335. diff --git a/synth.c b/synth.c
  2336. index 1d28d43..f42d49b 100644
  2337. --- a/synth.c
  2338. +++ b/synth.c
  2339. @@ -29,20 +29,6 @@
  2340. # include "frame.h"
  2341. # include "synth.h"
  2342. -/*
  2343. - * NAME: synth->init()
  2344. - * DESCRIPTION: initialize synth struct
  2345. - */
  2346. -void mad_synth_init(struct mad_synth *synth)
  2347. -{
  2348. - mad_synth_mute(synth);
  2349. -
  2350. - synth->phase = 0;
  2351. -
  2352. - synth->pcm.samplerate = 0;
  2353. - synth->pcm.channels = 0;
  2354. - synth->pcm.length = 0;
  2355. -}
  2356. /*
  2357. * NAME: synth->mute()
  2358. @@ -88,6 +74,10 @@ void mad_synth_mute(struct mad_synth *synth)
  2359. /* FPM_DEFAULT without OPT_SSO will actually lose accuracy and performance */
  2360. +# if defined(FPM_AVR32)
  2361. +# define OPT_SSO
  2362. +# endif
  2363. +
  2364. # if defined(FPM_DEFAULT) && !defined(OPT_SSO)
  2365. # define OPT_SSO
  2366. # endif
  2367. @@ -522,9 +512,15 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
  2368. # endif
  2369. # define ML0(hi, lo, x, y) ((lo) = (x) * (y))
  2370. # define MLA(hi, lo, x, y) ((lo) += (x) * (y))
  2371. -# define MLN(hi, lo) ((lo) = -(lo))
  2372. -# define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo))
  2373. -# define SHIFT(x) ((x) >> 2)
  2374. +# if defined(FPM_AVR32)
  2375. +# define MLN(hi, lo) MAD_F_MLN((hi), (lo))
  2376. +# define MLZ(hi, lo) (hi)
  2377. +# define SHIFT(x) ((x) << 2)
  2378. +# else
  2379. +# define MLN(hi, lo) ((lo) = -(lo))
  2380. +# define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo))
  2381. +# define SHIFT(x) ((x) >> 2)
  2382. +# endif
  2383. # define PRESHIFT(x) ((MAD_F(x) + (1L << 13)) >> 14)
  2384. # else
  2385. # define ML0(hi, lo, x, y) MAD_F_ML0((hi), (lo), (x), (y))
  2386. @@ -541,11 +537,54 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
  2387. # endif
  2388. # endif
  2389. +/*
  2390. + * NAME: synth->init()
  2391. + * DESCRIPTION: initialize synth struct
  2392. + */
  2393. +
  2394. +#ifdef FPM_AVR32
  2395. +short Dmod[17][33];
  2396. +#endif
  2397. +
  2398. static
  2399. +#ifdef FPM_AVR32
  2400. +short const D[17][32] = {
  2401. +#else
  2402. mad_fixed_t const D[17][32] = {
  2403. +#endif
  2404. # include "D.dat"
  2405. };
  2406. +void mad_synth_init(struct mad_synth *synth)
  2407. +{
  2408. +
  2409. + mad_synth_mute(synth);
  2410. +
  2411. + synth->phase = 0;
  2412. +
  2413. + synth->pcm.samplerate = 0;
  2414. + synth->pcm.channels = 0;
  2415. + synth->pcm.length = 0;
  2416. +
  2417. +#ifdef FPM_AVR32
  2418. + {
  2419. + int i, j;
  2420. + for ( i = 0; i < 17; i++ ){
  2421. + for ( j = 0; j < 32; j++ ){
  2422. + if ( j & 1 ){
  2423. + Dmod[i][17 + (j >> 1)]= D[i][j];
  2424. + } else {
  2425. + Dmod[i][(j >> 1)]= D[i][j];
  2426. + }
  2427. + }
  2428. +
  2429. + Dmod[i][16]= Dmod[i][16+8];
  2430. + }
  2431. + }
  2432. +#endif
  2433. +
  2434. +}
  2435. +
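mad_synth_init now also builds Dmod, a reordered copy of D with the even-indexed taps of each row packed at [0..15] and the odd-indexed taps at [17..32], so synth_avr32 can fetch two neighbouring taps with one ld.w; entry [16] duplicates D[i][15], apparently padding the seam between the two halves for the even-phase window reads. An index-explicit C form of the same reordering (build_dmod_row is illustrative):

/* equivalent to the reordering loop above, for one row */
static void build_dmod_row(short dst[33], const short src[32])
{
  int k;
  for (k = 0; k < 16; k++) {
    dst[k]      = src[2 * k];      /* even taps at [0..15] */
    dst[17 + k] = src[2 * k + 1];  /* odd taps at [17..32] */
  }
  dst[16] = dst[24];               /* = src[15], seam padding */
}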
  2436. # if defined(ASO_SYNTH)
  2437. void synth_full(struct mad_synth *, struct mad_frame const *,
  2438. unsigned int, unsigned int);
  2439. @@ -560,9 +599,13 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
  2440. {
  2441. unsigned int phase, ch, s, sb, pe, po;
  2442. mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8];
  2443. - mad_fixed_t const (*sbsample)[36][32];
  2444. + mad_fixed_t /*const*/ (*sbsample)[36][32];
  2445. register mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
  2446. +#ifdef FPM_AVR32
  2447. + register short const (*Dptr)[32], *ptr;
  2448. +#else
  2449. register mad_fixed_t const (*Dptr)[32], *ptr;
  2450. +#endif
  2451. register mad_fixed64hi_t hi;
  2452. register mad_fixed64lo_t lo;
  2453. @@ -573,6 +616,20 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
  2454. pcm1 = synth->pcm.samples[ch];
  2455. for (s = 0; s < ns; ++s) {
  2456. +# ifdef FPM_AVR32
  2457. +/*
  2458. + int i;
  2459. + for ( i = 0; i < 32; i++ ){
  2460. + (*sbsample)[s][i] = ((*sbsample)[s][i] + (1 << 13)) & 0xFFFFC000;
  2461. + }
  2462. +*/
  2463. + dct32_avr32((*sbsample)[s], phase >> 1,
  2464. + (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
  2465. + /* printf("dct32: %d\n", GET_CYCLES);*/
  2466. + pcm1 = synth_avr32(phase, (mad_fixed_t *)filter, \
  2467. + pcm1, (short *)&Dmod[0]);
  2468. + /* printf("synth_window: %d\n", GET_CYCLES);*/
  2469. +# else
  2470. dct32((*sbsample)[s], phase >> 1,
  2471. (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
  2472. @@ -679,6 +736,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
  2473. MLA(hi, lo, (*fo)[7], ptr[ 2]);
  2474. *pcm1 = SHIFT(-MLZ(hi, lo));
  2475. +# endif
  2476. pcm1 += 16;
  2477. phase = (phase + 1) % 16;
  2478. diff --git a/synth_avr32.S b/synth_avr32.S
  2479. new file mode 100644
  2480. index 0000000..701077b
  2481. --- /dev/null
  2482. +++ b/synth_avr32.S
  2483. @@ -0,0 +1,394 @@
  2484. +/*
  2485. + Optimized function for speeding up synthesis filter
  2486. + in MPEG Audio Decoding.
  2487. + Copyright 2003-2006 Atmel Corporation.
  2488. +
  2489. + Written by Ronny Pedersen and Lars Even Almås, Atmel Norway
  2490. +
  2491. + This program is free software; you can redistribute it and/or modify
  2492. + it under the terms of the GNU General Public License as published by
  2493. + the Free Software Foundation; either version 2 of the License, or
  2494. + (at your option) any later version.
  2495. +
  2496. + This program is distributed in the hope that it will be useful,
  2497. + but WITHOUT ANY WARRANTY; without even the implied warranty of
  2498. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  2499. + GNU General Public License for more details.
  2500. +
  2501. + You should have received a copy of the GNU General Public License
  2502. + along with this program; if not, write to the Free Software
  2503. + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
  2504. +
  2505. +
  2506. +/* *****************
  2507. + Defining macros
  2508. + ***************** */
  2509. +
  2510. + .macro window_1 f, ptr, acc, ptr_offset, mul, tmp1_lo, tmp1_hi, tmp2_lo, tmp2_hi, tmp3_lo, tmp3_hi
  2511. + ld.d \tmp1_lo, \f[0*4] /* tmp1 = { f[0], f[1] } */
  2512. + ld.w \tmp2_lo, \ptr[0*2+\ptr_offset*2] /* tmp2_lo = { ptr[0], ptr[1] }*/
  2513. + ld.d \tmp3_lo, \f[6*4] /* tmp3 = { f[6], f[7] } */
  2514. + ld.w \tmp2_hi, \ptr[6*2+\ptr_offset*2] /* tmp2_hi = { ptr[6], ptr[7] }*/
  2515. + .if \mul
  2516. + mulwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[0]*/
  2517. + .else
  2518. + macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[0]*/
  2519. + .endif
  2520. + macwh.d \acc, \tmp3_lo, \tmp2_lo:b /* f[7] * ptr[1]*/
  2521. + ld.w \tmp2_lo, \ptr[2*2+\ptr_offset*2] /* tmp2_lo = { ptr[2], ptr[3] }*/
  2522. + macwh.d \acc, \tmp1_lo, \tmp2_hi:b /* f[1] * ptr[7]*/
  2523. + ld.d \tmp1_lo, \f[2*4] /* tmp1 = { f[2], f[3] } */
  2524. +
  2525. + macwh.d \acc, \tmp3_hi, \tmp2_lo:t /* f[6] * ptr[2]*/
  2526. + macwh.d \acc, \tmp1_hi, \tmp2_hi:t /* f[2] * ptr[6]*/
  2527. + ld.d \tmp3_lo, \f[4*4] /* tmp3 = { f[4], f[5] } */
  2528. + ld.w \tmp2_hi, \ptr[4*2+\ptr_offset*2] /* tmp2_hi = { ptr[4], ptr[5] }*/
  2529. + macwh.d \acc, \tmp3_lo, \tmp2_lo:b /* f[5] * ptr[3]*/
  2530. +
  2531. + macwh.d \acc, \tmp1_lo, \tmp2_hi:b /* f[3] * ptr[5]*/
  2532. + macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[4] * ptr[4]*/
  2533. + .endm
  2534. +
  2535. + .macro window_2 f, ptr, acc, ptr_offset, mul, tmp1_lo, tmp1_hi, tmp2_lo, tmp2_hi, tmp3_lo, tmp3_hi
  2536. + ld.d \tmp1_lo, \f[0*4] /* tmp1 = { f[0], f[1] } */
  2537. + ld.w \tmp2_lo, \ptr[7*2+\ptr_offset*2] /* tmp2_lo = { ptr[7], ptr[8] }*/
  2538. + ld.d \tmp3_lo, \f[2*4] /* tmp3 = { f[2], f[3] } */
  2539. + ld.w \tmp2_hi, \ptr[9*2+\ptr_offset*2] /* tmp2_hi = { ptr[9], ptr[10] }*/
  2540. + .if \mul
  2541. + mulwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[7]*/
  2542. + .else
  2543. + macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[7]*/
  2544. + .endif
  2545. + macwh.d \acc, \tmp1_lo, \tmp2_lo:b /* f[1] * ptr[8]*/
  2546. +
  2547. + ld.d \tmp1_lo, \f[4*4] /* tmp1 = { f[4], f[5] } */
  2548. + ld.w \tmp2_lo, \ptr[11*2+\ptr_offset*2] /* tmp2_lo = { ptr[11], ptr[12] }*/
  2549. +
  2550. + macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[2] * ptr[9]*/
  2551. + macwh.d \acc, \tmp3_lo, \tmp2_hi:b /* f[3] * ptr[10]*/
  2552. +
  2553. + ld.d \tmp3_lo, \f[6*4] /* tmp3 = { f[6], f[7] } */
  2554. + ld.w \tmp2_hi, \ptr[13*2+\ptr_offset*2] /* tmp2_hi = { ptr[13], ptr[14] }*/
  2555. +
  2556. + macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[4] * ptr[11]*/
  2557. + macwh.d \acc, \tmp1_lo, \tmp2_lo:b /* f[5] * ptr[12]*/
  2558. + macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[6] * ptr[13]*/
  2559. + macwh.d \acc, \tmp3_lo, \tmp2_hi:b /* f[7] * ptr[14]*/
  2560. + .endm
  2561. +
  2562. + .macro scale res, d_lo, d_hi
  2563. + lsl \d_hi, 2
  2564. + .endm
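window_1 and window_2 each form one eight-tap dot product between a filter row f[0..7] and a window slice at ptr, with the tap order taken from the ML0/MLA sequences they replace (quoted in the comments further down); mul=1 starts a sum with mulwh.d, mul=0 continues one with macwh.d, and scale then turns the accumulator's high word into the PCM sample with a left shift of two. C references for the two tap patterns, under the same (word x halfword) << 16 accumulator model assumed for layer3.c:

#include <stdint.h>

/* window_1: f[0]*p[0], then f[k]*p[8-k] for k = 1..7 */
static int64_t window_1_ref(const int32_t f[8], const int16_t *p)
{
  int64_t acc = 0;
  int k;
  for (k = 0; k < 8; k++)
    acc += ((int64_t) f[k] * p[(8 - k) & 7]) << 16;
  return acc;
}

/* window_2: f[k]*p[7+k] for k = 0..7 */
static int64_t window_2_ref(const int32_t f[8], const int16_t *p)
{
  int64_t acc = 0;
  int k;
  for (k = 0; k < 8; k++)
    acc += ((int64_t) f[k] * p[7 + k]) << 16;
  return acc;
}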
  2565. +
  2566. +/* **********************
  2567. + Starting main function
  2568. + ********************** */
  2569. +
  2570. +/* Function synth_avr32 is called from synth.c with arguments:
  2571. + phase, filter, *pcm1, &D[0] */
  2572. +
  2573. + .global synth_avr32
  2574. +synth_avr32:
  2575. + pushm r0-r7, lr
  2576. + sub sp, 8
  2577. +
 2578. + /* r12 = phase, r11 = filter, r10 = pcm1, r9 = D */
  2579. + bld r12, 0
  2580. + brcc synth_even
  2581. +
  2582. + /* Filter for odd phases */
  2583. +
  2584. + /* fe = &(*filter)[0][1][0];
  2585. + fx = &(*filter)[0][0][0];
  2586. + fo = &(*filter)[1][0][0]; */
  2587. + sub lr /*fe*/, r11, -16*8*4
  2588. + sub r8 /*fo*/, r11, -16*8*4*2
  2589. +
  2590. + /* pe = phase >> 1; */
  2591. + lsr r12, 1
  2592. + stdsp sp[4], r12
  2593. + /* ptr = (short const *)Dmod + pe; */
  2594. + add r12, r9, r12 << 1
  2595. +
  2596. + /* ML0(hi, lo, (*fx)[0], ptr[0 + 17]);
  2597. + MLA(hi, lo, (*fx)[1], ptr[7 + 17]);
  2598. + MLA(hi, lo, (*fx)[2], ptr[6 + 17]);
  2599. + MLA(hi, lo, (*fx)[3], ptr[5 + 17]);
  2600. + MLA(hi, lo, (*fx)[4], ptr[4 + 17]);
  2601. + MLA(hi, lo, (*fx)[5], ptr[3 + 17]);
  2602. + MLA(hi, lo, (*fx)[6], ptr[2 + 17]);
  2603. + MLA(hi, lo, (*fx)[7], ptr[1 + 17]); */
  2604. + window_1 r11/*fx*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2605. +
  2606. + /* MLN(hi, lo); */
  2607. + neg r0
  2608. + acr r1
  2609. + neg r1
  2610. +
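The neg/acr/neg triple is MLN as a 64-bit two's-complement negation spread over the r1:r0 pair: negate the low word, fold the resulting borrow into the high word, then negate that. The same computation in C:

#include <stdint.h>

/* negate the 64-bit accumulator held as {hi, lo} */
static void mln_model(int32_t *hi, uint32_t *lo)
{
  uint32_t borrow = (*lo != 0);     /* carry out of 'neg r0' */
  *lo = 0u - *lo;                   /* neg r0 */
  *hi = -(*hi + (int32_t) borrow);  /* acr r1; neg r1 */
}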
  2611. + /* MLA(hi, lo, (*fe)[0], ptr[0]);
  2612. + MLA(hi, lo, (*fe)[1], ptr[7]);
  2613. + MLA(hi, lo, (*fe)[2], ptr[6]);
  2614. + MLA(hi, lo, (*fe)[3], ptr[5]);
  2615. + MLA(hi, lo, (*fe)[4], ptr[4]);
  2616. + MLA(hi, lo, (*fe)[5], ptr[3]);
  2617. + MLA(hi, lo, (*fe)[6], ptr[2]);
  2618. + MLA(hi, lo, (*fe)[7], ptr[1]); */
  2619. + window_1 lr/*fe*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
  2620. +
  2621. + /* *pcm1++ = SHIFT(MLZ(hi, lo));
  2622. +
  2623. + pcm2 = pcm1 + 31; */
  2624. + scale r1, r0, r1
  2625. + st.w r10/*pcm_1*/++, r1
  2626. + sub r11/*pcm2*/, r10, -4*31
  2627. +
  2628. + /* for (sb = 1; sb < 16; ++sb) { */
  2629. + mov r2, 15
  2630. + stdsp sp[0], r2
  2631. +odd_loop:
  2632. + /* ++fe;
  2633. + ptr += 33; */
  2634. + sub lr /*fe*/, -8*4
  2635. + sub r12, -33*2
  2636. +
  2637. + /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
  2638. + MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
  2639. + MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
  2640. + MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
  2641. + MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
  2642. + MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
  2643. + MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
  2644. + MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
  2645. + window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2646. + /* MLN(hi, lo); */
  2647. +
  2648. + neg r0
  2649. + acr r1
  2650. + neg r1
  2651. +
  2652. + /* MLA(hi, lo, (*fe)[7], ptr[1]);
  2653. + MLA(hi, lo, (*fe)[6], ptr[2]);
  2654. + MLA(hi, lo, (*fe)[5], ptr[3]);
  2655. + MLA(hi, lo, (*fe)[4], ptr[4]);
  2656. + MLA(hi, lo, (*fe)[3], ptr[5]);
  2657. + MLA(hi, lo, (*fe)[2], ptr[6]);
  2658. + MLA(hi, lo, (*fe)[1], ptr[7]);
  2659. + MLA(hi, lo, (*fe)[0], ptr[0]); */
  2660. + window_1 lr/*fe*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
  2661. +
  2662. + /* ptr -= 2*pe; */
  2663. + lddsp r2, sp[4]
  2664. +
  2665. + /* *pcm1++ = SHIFT(MLZ(hi, lo)); */
  2666. +
  2667. + scale r1, r0, r1
  2668. + sub r12/*ptr*/, r12, r2/*pe*/<< 2
  2669. + st.w r10/*pcm_1*/++, r1
  2670. +
  2671. +
  2672. + /* ML0(hi, lo, (*fe)[0], ptr[7 + 17]);
  2673. + MLA(hi, lo, (*fe)[1], ptr[8 + 17]);
  2674. + MLA(hi, lo, (*fe)[2], ptr[9 + 17]);
  2675. + MLA(hi, lo, (*fe)[3], ptr[10 + 17]);
  2676. + MLA(hi, lo, (*fe)[4], ptr[11 + 17]);
  2677. + MLA(hi, lo, (*fe)[5], ptr[12 + 17]);
  2678. + MLA(hi, lo, (*fe)[6], ptr[13 + 17]);
  2679. + MLA(hi, lo, (*fe)[7], ptr[14 + 17]); */
  2680. + window_2 lr/*fe*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2681. + /* MLA(hi, lo, (*fo)[7], ptr[14]);
  2682. + MLA(hi, lo, (*fo)[6], ptr[13]);
  2683. + MLA(hi, lo, (*fo)[5], ptr[12]);
  2684. + MLA(hi, lo, (*fo)[4], ptr[11]);
  2685. + MLA(hi, lo, (*fo)[3], ptr[10]);
  2686. + MLA(hi, lo, (*fo)[2], ptr[9]);
  2687. + MLA(hi, lo, (*fo)[1], ptr[8]);
  2688. + MLA(hi, lo, (*fo)[0], ptr[7]); */
  2689. + window_2 r8/*fo*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
  2690. +
  2691. +
  2692. + /* *pcm2-- = SHIFT(MLZ(hi, lo)); */
  2693. + lddsp r3, sp[4]
  2694. + lddsp r2, sp[0]
  2695. + scale r1, r0, r1
  2696. + st.w --r11/*pcm_2*/, r1
  2697. +
  2698. + /* ptr += 2*pe; */
  2699. + add r12/*ptr*/, r12, r3/*pe*/<< 2
  2700. +
  2701. + /* ++fo;
  2702. + } */
  2703. + sub r8/*fo*/, -8*4
  2704. +
  2705. + sub r2, 1
  2706. + stdsp sp[0], r2
  2707. + brne odd_loop
  2708. +
  2709. + /* ptr += 33; */
  2710. + sub r12/*ptr*/, -33*2
  2711. +
  2712. + /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
  2713. + MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
  2714. + MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
  2715. + MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
  2716. + MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
  2717. + MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
  2718. + MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
  2719. + MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
  2720. + window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2721. +
  2722. + rjmp synth_end
  2723. +synth_even:
  2724. + /* Filter for even phases */
  2725. +
  2726. + /* fe = &(*filter)[0][0][0];
  2727. + fx = &(*filter)[0][1][0];
  2728. + fo = &(*filter)[1][1][0]; */
  2729. + sub lr /*fx*/, r11, -16*8*4
  2730. + sub r8 /*fo*/, r11, -(16*8*4*2 + 16*8*4)
  2731. +
  2732. + /* po = ((phase - 1) & 0xF) >> 1; */
  2733. + sub r12, 1
  2734. + andl r12, 0xe, COH
  2735. + stdsp sp[4], r12
  2736. + /* ptr = (short const *)Dmod + po; */
  2737. + add r12, r9, r12
  2738. +
  2739. + /* ML0(hi, lo, (*fx)[0], ptr[0 + 17]);
  2740. + MLA(hi, lo, (*fx)[1], ptr[7 + 17]);
  2741. + MLA(hi, lo, (*fx)[2], ptr[6 + 17]);
  2742. + MLA(hi, lo, (*fx)[3], ptr[5 + 17]);
  2743. + MLA(hi, lo, (*fx)[4], ptr[4 + 17]);
  2744. + MLA(hi, lo, (*fx)[5], ptr[3 + 17]);
  2745. + MLA(hi, lo, (*fx)[6], ptr[2 + 17]);
  2746. + MLA(hi, lo, (*fx)[7], ptr[1 + 17]); */
  2747. + window_1 lr/*fx*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2748. +
  2749. + /* MLN(hi, lo); */
  2750. + neg r0
  2751. + acr r1
  2752. + neg r1
  2753. +
  2754. + /* MLA(hi, lo, (*fe)[0], ptr[0 + 1]);
  2755. + MLA(hi, lo, (*fe)[1], ptr[7 + 1]);
  2756. + MLA(hi, lo, (*fe)[2], ptr[6 + 1]);
  2757. + MLA(hi, lo, (*fe)[3], ptr[5 + 1]);
  2758. + MLA(hi, lo, (*fe)[4], ptr[4 + 1]);
  2759. + MLA(hi, lo, (*fe)[5], ptr[3 + 1]);
  2760. + MLA(hi, lo, (*fe)[6], ptr[2 + 1]);
  2761. + MLA(hi, lo, (*fe)[7], ptr[1 + 1]); */
  2762. + window_1 r11/*fe*/,r12/*ptr*/,r0/*acc*/,1/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
  2763. +
  2764. + /* *pcm1++ = SHIFT(MLZ(hi, lo));
  2765. +
  2766. + pcm2 = pcm1 + 31; */
  2767. + scale r1, r0, r1
  2768. + st.w r10/*pcm_1*/++, r1
  2769. + sub lr/*pcm2*/, r10, -4*31
  2770. +
  2771. + /* for (sb = 1; sb < 16; ++sb) { */
  2772. + mov r2, 15
  2773. + stdsp sp[0], r2
  2774. +even_loop:
  2775. + /* ++fe;
  2776. + ptr += 33; */
  2777. + sub r11 /*fe*/, -8*4
  2778. + sub r12, -33*2
  2779. +
  2780. + /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
  2781. + MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
  2782. + MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
  2783. + MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
  2784. + MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
  2785. + MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
  2786. + MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
  2787. + MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
  2788. + window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2789. + /* MLN(hi, lo); */
  2790. + neg r0
  2791. + acr r1
  2792. + neg r1
  2793. +
  2794. + /* MLA(hi, lo, (*fe)[7], ptr[1 + 1]);
  2795. + MLA(hi, lo, (*fe)[6], ptr[2 + 1]);
  2796. + MLA(hi, lo, (*fe)[5], ptr[3 + 1]);
  2797. + MLA(hi, lo, (*fe)[4], ptr[4 + 1]);
  2798. + MLA(hi, lo, (*fe)[3], ptr[5 + 1]);
  2799. + MLA(hi, lo, (*fe)[2], ptr[6 + 1]);
  2800. + MLA(hi, lo, (*fe)[1], ptr[7 + 1]);
  2801. + MLA(hi, lo, (*fe)[0], ptr[0 + 1]); */
  2802. + window_1 r11/*fe*/,r12/*ptr*/,r0/*acc*/,1/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
  2803. +
  2804. + /* *pcm1++ = SHIFT(MLZ(hi, lo)); */
  2805. + lddsp r2, sp[4]
  2806. + scale r1, r0, r1
  2807. + /* ptr -= 2*po; */
  2808. + sub r12/*ptr*/, r12, r2/*po*/<< 1
  2809. + st.w r10/*pcm_1*/++, r1
  2810. +
  2811. +
  2812. + /* ML0(hi, lo, (*fe)[0], ptr[7 + 17 - 1]);
  2813. + MLA(hi, lo, (*fe)[1], ptr[8 + 17 - 1]);
  2814. + MLA(hi, lo, (*fe)[2], ptr[9 + 17 - 1]);
  2815. + MLA(hi, lo, (*fe)[3], ptr[10 + 17 - 1]);
  2816. + MLA(hi, lo, (*fe)[4], ptr[11 + 17 - 1]);
  2817. + MLA(hi, lo, (*fe)[5], ptr[12 + 17 - 1]);
  2818. + MLA(hi, lo, (*fe)[6], ptr[13 + 17 - 1]);
  2819. + MLA(hi, lo, (*fe)[7], ptr[14 + 17 - 1]); */
  2820. + window_2 r11/*fe*/,r12/*ptr*/,r0/*acc*/,16/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2821. + /* MLA(hi, lo, (*fo)[7], ptr[14]);
  2822. + MLA(hi, lo, (*fo)[6], ptr[13]);
  2823. + MLA(hi, lo, (*fo)[5], ptr[12]);
  2824. + MLA(hi, lo, (*fo)[4], ptr[11]);
  2825. + MLA(hi, lo, (*fo)[3], ptr[10]);
  2826. + MLA(hi, lo, (*fo)[2], ptr[9]);
  2827. + MLA(hi, lo, (*fo)[1], ptr[8]);
  2828. + MLA(hi, lo, (*fo)[0], ptr[7]); */
  2829. + window_2 r8/*fo*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
  2830. +
  2831. +
  2832. + /* *pcm2-- = SHIFT(MLZ(hi, lo)); */
  2833. + lddsp r3, sp[4]
  2834. + lddsp r2, sp[0]
  2835. + scale r1, r0, r1
  2836. + st.w --lr/*pcm_2*/, r1
  2837. +
  2838. + /* ptr += 2*po; */
  2839. + add r12/*ptr*/, r12, r3/*po*/<< 1
  2840. +
  2841. + /* ++fo;
  2842. + } */
  2843. + sub r8/*fo*/, -8*4
  2844. +
  2845. + sub r2, 1
  2846. + stdsp sp[0], r2
  2847. + brne even_loop
  2848. +
  2849. + /* ptr += 33; */
  2850. + sub r12/*ptr*/, -33*2
  2851. +
  2852. + /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
  2853. + MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
  2854. + MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
  2855. + MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
  2856. + MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
  2857. + MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
  2858. + MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
  2859. + MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
  2860. + window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
  2861. +
  2862. +
  2863. +
  2864. +synth_end:
  2865. + /* *pcm1 = SHIFT(-MLZ(hi, lo)); */
  2866. + scale r1, r0, r1
  2867. + neg r1
  2868. + st.w r10/*pcm_1*/, r1
  2869. +
  2870. + mov r12, r10
  2871. + sub sp, -8
  2872. + popm r0-r7, pc
  2873. +
  2874. +
  2875. +
  2876. +
  2877. +