mplayer-1.0rc1-atmel.3.patch

 cfg-common.h                     |    4 +
 cfg-mencoder.h                   |    4 +
 cfg-mplayer.h                    |    4 +
 configure                        |   13 +-
 libaf/af_format.c                |    7 +
 libavcodec/Makefile              |    7 +
 libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
 libavcodec/avr32/fdct.S          |  541 ++++++++
 libavcodec/avr32/h264idct.S      |  451 +++++++
 libavcodec/avr32/idct.S          |  829 ++++++++++++
 libavcodec/avr32/mc.S            |  434 ++++++
 libavcodec/avr32/pico.h          |  260 ++++
 libavcodec/bitstream.h           |   77 +-
 libavcodec/dsputil.c             |    3 +
 libavcodec/h264.c                |   15 +
 libavutil/common.h               |   16 +
 libavutil/internal.h             |    9 +
 libfaad2/common.h                |    2 +-
 libmpcodecs/ad_libmad.c          |    5 +
 libswscale/pico-avr32.h          |  137 ++
 libswscale/swscale_internal.h    |    2 +-
 libswscale/yuv2rgb.c             |   14 +
 libswscale/yuv2rgb_avr32.c       |  416 ++++++
 libvo/vo_fbdev2.c                |  101 ++-
 version.sh                       |    2 +-
 25 files changed, 6011 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/avr32/dsputil_avr32.c
 create mode 100644 libavcodec/avr32/fdct.S
 create mode 100644 libavcodec/avr32/h264idct.S
 create mode 100644 libavcodec/avr32/idct.S
 create mode 100644 libavcodec/avr32/mc.S
 create mode 100644 libavcodec/avr32/pico.h
 create mode 100644 libswscale/pico-avr32.h
 create mode 100644 libswscale/yuv2rgb_avr32.c
diff --git a/cfg-common.h b/cfg-common.h
index 780df38..7d878a8 100644
--- a/cfg-common.h
+++ b/cfg-common.h
@@ -235,6 +235,10 @@
 	{"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
 	{"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+#ifdef ARCH_AVR32
+	{"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+	{"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+#endif
 	// draw by slices or whole frame (useful with libmpeg2/libavcodec)
 	{"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
 	{"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
diff --git a/cfg-mencoder.h b/cfg-mencoder.h
index 411b748..addf791 100644
--- a/cfg-mencoder.h
+++ b/cfg-mencoder.h
@@ -5,6 +5,10 @@
 #include "cfg-common.h"
+#ifdef ARCH_AVR32
+extern int avr32_use_pico;
+#endif
+
 #ifdef USE_FAKE_MONO
 extern int fakemono; // defined in dec_audio.c
 #endif
diff --git a/cfg-mplayer.h b/cfg-mplayer.h
index 62b6eac..31499c2 100644
--- a/cfg-mplayer.h
+++ b/cfg-mplayer.h
@@ -4,6 +4,10 @@
 #include "cfg-common.h"
+#ifdef ARCH_AVR32
+extern int avr32_use_pico;
+#endif
+
 extern int noconsolecontrols;
 #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
diff --git a/configure b/configure
index 29002c8..56c6fe4 100755
--- a/configure
+++ b/configure
@@ -1203,6 +1203,15 @@ EOF
     _optimizing="$proc"
     ;;
+  avr32)
+    _def_arch='#define ARCH_AVR32'
+    _target_arch='TARGET_ARCH_AVR32 = yes'
+    iproc='avr32'
+    proc=''
+    _march=''
+    _mcpu=''
+    _optimizing=''
+    ;;
   arm|armv4l|armv5tel)
     _def_arch='#define ARCH_ARMV4L 1'
     _target_arch='TARGET_ARCH_ARMV4L = yes'
@@ -1533,7 +1542,7 @@ echores $_named_asm_args
 # Checking for CFLAGS
 _stripbinaries=yes
 if test "$_profile" != "" || test "$_debug" != "" ; then
-  CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
+  CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
   if test "$_cc_major" -ge "3" ; then
     CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
   fi
@@ -3794,7 +3803,7 @@ fi
 echocheck "X11 headers presence"
-  for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
+  for I in `echo $_inc_extra | sed s/-I//g`; do
   if test -f "$I/X11/Xlib.h" ; then
     _inc_x11="-I$I"
     _x11_headers="yes"
diff --git a/libaf/af_format.c b/libaf/af_format.c
index e5b7cc9..5d7ea6d 100644
--- a/libaf/af_format.c
+++ b/libaf/af_format.c
@@ -20,7 +20,14 @@
 // Integer to float conversion through lrintf()
 #ifdef HAVE_LRINTF
 #include <math.h>
+
+#ifdef ARCH_AVR32
+#define lrintf(x) rint(x)
+#define llrint(x) (long long)rint(x)
+#else
 long int lrintf(float);
+#endif
+
 #else
 #define lrintf(x) ((int)(x))
 #endif
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 17b6c45..8e1dc96 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \
 sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
+# avr32 specific stuff
+ifeq ($(TARGET_ARCH_AVR32),yes)
+ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
+OBJS += avr32/dsputil_avr32.o
+endif
+
 # sun mediaLib specific stuff
 OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \
@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
 clean::
 	rm -f \
 	  i386/*.o i386/*~ \
+	  avr32/*.o avr32/*~ \
 	  armv4l/*.o armv4l/*~ \
 	  mlib/*.o mlib/*~ \
 	  alpha/*.o alpha/*~ \
diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
new file mode 100644
index 0000000..200284d
--- /dev/null
+++ b/libavcodec/avr32/dsputil_avr32.c
@@ -0,0 +1,2678 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials provided
+ *    with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ *    derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "../dsputil.h"
+#include "pico.h"
+
+int avr32_use_pico = 1;
+
+//#define CHECK_DSP_FUNCS_AGAINST_C
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define DSP_FUNC_NAME(name) test_ ## name
+#else
+#define DSP_FUNC_NAME(name) name
+#endif
+
+union doubleword {
+    int64_t doubleword;
+    struct {
+        int32_t top;
+        int32_t bottom;
+    } words;
+};
+
+#undef LD16
+#undef LD32
+#undef LD64
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+#define LD64_UNALIGNED(a) \
+    ({ union doubleword __tmp__; \
+       __tmp__.words.top = LD32(a); \
+       __tmp__.words.bottom = LD32(a + 4); \
+       __tmp__.doubleword; })
+
+#undef ST32
+#undef ST16
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+
+#undef rnd_avg32
+#define rnd_avg32(a, b) \
+    ({ uint32_t __tmp__; \
+       asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b)); \
+       __tmp__; })
+
+void idct_avr32(DCTELEM *data);
+void fdct_avr32(DCTELEM *data);
+
+void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+
+void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+
+#define extern_dspfunc(PFX, NUM) \
+    void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h)
+
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+#undef extern_dspfunc
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define extern_dspfunc(PFX, NUM) \
+    void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h)
+
+extern_dspfunc(put, 4);
+extern_dspfunc(put_no_rnd, 4);
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(put, 16);
+extern_dspfunc(put_no_rnd, 16);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+extern_dspfunc(avg, 16);
+extern_dspfunc(avg_no_rnd, 16);
+
+
+#undef extern_dspfunc
+#define extern_dspfunc(PFX, NUM) \
+void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
+
+extern_dspfunc(put_h264_qpel, 16);
+extern_dspfunc(put_h264_qpel, 8);
+extern_dspfunc(put_h264_qpel, 4);
+extern_dspfunc(avg_h264_qpel, 16);
+extern_dspfunc(avg_h264_qpel, 8);
+extern_dspfunc(avg_h264_qpel, 4);
+
+#undef extern_dspfunc
+
+void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+
+void dump_block8(uint8_t *block, int line_size, int h);
+void dump_block4(uint8_t *block, int line_size, int h);
+void dump_block(uint8_t *block, int line_size, int h, int w);
+
+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+                  int h, char *name, int max_dev);
+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+                  int h, char *name, int max_dev);
+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+                 int h, int width, char *name, int max_dev);
+
+#define PIXOP2( OPNAME, OP ) \
+void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block )), LD32(pixels ));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                           int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1 ]);\
+        b= LD32(&src2[i*src_stride2 ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+    }\
+}\
+\
+void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                           int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1 ]);\
+        b= LD32(&src2[i*src_stride2 ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+    }\
+}\
+\
+void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                            int src_stride1, int src_stride2, int h){\
+    OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#else
+#define PIXOP2( OPNAME, OP ) \
+static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block )), LD32(pixels ));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block )), LD32(pixels ));\
+        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block )), LD32(pixels ));\
+        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+        OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
+        OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                  int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1 ]);\
+        b= LD32(&src2[i*src_stride2 ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                  int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1 ]);\
+        b= LD32(&src2[i*src_stride2 ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                   int src_stride1, int src_stride2, int h){\
+    OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#endif
+
+#define op_avg(a, b) a = rnd_avg32(a, b)
+#define op_put(a, b) a = b
+
+PIXOP2(avg, op_avg)
+PIXOP2(put, op_put)
+#undef op_avg
+#undef op_put
+
+
+
+static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        ST32(dst , LD32(src ));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static void clear_blocks_avr32(DCTELEM *blocks)
+{
+    int n = 12;
+    uint64_t tmp1, tmp2;
+    blocks += 6*64;
+    asm volatile ( "mov\t%1, 0\n"
+                   "mov\t%m1, 0\n"
+                   "mov\t%2, 0\n"
+                   "mov\t%m2, 0\n"
+                   "0:\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "sub\t%0, 1\n"
+                   "brne\t0b\n"
+                   : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
+                     "+r"(blocks));
+}
+
+
+static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        ST32(dst , LD32(src ));
+        ST32(dst+4 , LD32(src+4 ));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        ST32(dst , LD32(src ));
+        ST32(dst+4 , LD32(src+4 ));
+        ST32(dst+8 , LD32(src+8 ));
+        ST32(dst+12, LD32(src+12));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+
+static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+
+        int src0 = LD32(src);
+        int src1 = LD32(src + stride);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+        src += stride;
+        ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0));
+        dst += stride;
+    }
+}
+
+
+static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+        dst+= stride;
+        src+= stride;
+        */
+
+        int src0 = LD32(src);
+        int src1 = (((int)src[4] << 24) | (int)src[stride]);
+        int src2 = LD32(src + stride + 1);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+        dst += stride;
+    }
+}
+
+static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+        dst+= stride;
+        src+= stride;
+        */
+        int src0 = LD32(src);
+        int src1 = (((int)src[4] << 24) | (int)src[stride]);
+        int src2 = LD32(src + stride + 1);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+        src0 = LD32(src + 4);
+        src1 = (src[8] << 24) | src[stride + 4];
+        src2 = LD32(src + stride + 5);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
+
+        dst += stride;
+    }
+}
+
+
+static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        int src0 = LD32(src);
+        int src1 = LD32(src + stride);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+        src += stride;
+        ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
+        dst += stride;
+    }
+}
+
+
+static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+        dst+= stride;
+        src+= stride;
+        */
+
+        int src0 = *((int *)src);
+        int src1 = (int)((src[4] << 24) | src[stride]);
+        int src2 = *((int *)(src + stride + 1));
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+        dst += stride;
+    }
+}
+
+static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+        dst+= stride;
+        src+= stride;
+        */
+        int src0 = *((int *)src);
+        int src1 = (int)((src[4] << 24) | src[stride]);
+        int src2 = *((int *)(src + stride + 1));
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+
+        src0 = *((int *)(src + 4));
+        src1 = (int)((src[8] << 24) | src[stride + 4]);
+        src2 = *((int *)(src + stride + 5));
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
+        dst += stride;
+    }
+}
+
  789. +static struct pico_config_t h264_qpel4_h_lowpass_config = {
  790. + .input_mode = PICO_HOR_FILTER_MODE,
  791. + .output_mode = PICO_PLANAR_MODE,
  792. + .coeff_frac_bits = 5,
  793. + .offset_frac_bits = 5,
  794. + .coeff0_0 = 1,
  795. + .coeff0_1 = -5,
  796. + .coeff0_2 = 20,
  797. + .coeff0_3 = 16,
  798. + .coeff1_0 = 20,
  799. + .coeff1_1 = -5,
  800. + .coeff1_2 = 1,
  801. + .coeff1_3 = 0,
  802. + .coeff2_0 = 0,
  803. + .coeff2_1 = 0,
  804. + .coeff2_2 = 0,
  805. + .coeff2_3 = 0
  806. +};
  807. +
  808. +
  809. +
  810. +static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  811. + const int h=4;
  812. + int i;
  813. +
  814. + set_pico_config(&h264_qpel4_h_lowpass_config);
  815. +
  816. + for(i=0; i<h; i++){
  817. +
  818. + /*
  819. + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
  820. + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
  821. + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
  822. + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
  823. + dst+=dstStride;\
  824. + src+=srcStride;\ */
  825. + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
  826. + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
  827. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  828. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  829. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  830. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  831. + src += srcStride;
  832. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  833. + dst += dstStride;
  834. + }
  835. +}
  836. +
  837. +static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  838. + const int h=4;
  839. + int i;
  840. +
  841. + set_pico_config(&h264_qpel4_h_lowpass_config);
  842. +
  843. + for(i=0; i<h; i++){
  844. +
  845. + /*
  846. + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
  847. + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
  848. + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
  849. + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
  850. + dst+=dstStride;\
  851. + src+=srcStride;\ */
  852. +
  853. + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
  854. + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
  855. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  856. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  857. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  858. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  859. + src += srcStride;
  860. + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
  861. + dst += dstStride;
  862. + }
  863. +}
  864. +
  865. +static struct pico_config_t h264_qpel4_v_lowpass_config1 = {
  866. + .input_mode = PICO_VERT_FILTER_MODE,
  867. + .output_mode = PICO_PACKED_MODE,
  868. + .coeff_frac_bits = 5,
  869. + .offset_frac_bits = 5,
  870. + .coeff0_0 = 1,
  871. + .coeff0_1 = -5,
  872. + .coeff0_2 = 20,
  873. + .coeff0_3 = 16,
  874. + .coeff1_0 = 1,
  875. + .coeff1_1 = -5,
  876. + .coeff1_2 = 20,
  877. + .coeff1_3 = 16,
  878. + .coeff2_0 = 1,
  879. + .coeff2_1 = -5,
  880. + .coeff2_2 = 20,
  881. + .coeff2_3 = 16
  882. +};
  883. +
  884. +
  885. +
  886. +static struct pico_config_t h264_qpel4_v_lowpass_config2 = {
  887. + .input_mode = PICO_VERT_FILTER_MODE,
  888. + .output_mode = PICO_PLANAR_MODE,
  889. + .coeff_frac_bits = 5,
  890. + .offset_frac_bits = 5,
  891. + .coeff0_0 = 1,
  892. + .coeff0_1 = -5,
  893. + .coeff0_2 = 20,
  894. + .coeff0_3 = 16,
  895. + .coeff1_0 = 20,
  896. + .coeff1_1 = -5,
  897. + .coeff1_2 = 1,
  898. + .coeff1_3 = 0,
  899. + .coeff2_0 = 0,
  900. + .coeff2_1 = 0,
  901. + .coeff2_2 = 0,
  902. + .coeff2_3 = 0
  903. +};
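+ /* The vertical lowpass uses two configurations: config1 runs the
+ filter down the three leftmost columns in packed mode, while
+ config2 switches to the planar layout so the one remaining column
+ can be filtered as a single vector (see the wordbytes transpose
+ below). */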
  904. +
  905. +static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  906. +
  907. + /*
  908. + const int w=4;
  909. + uint8_t *cm = cropTbl + MAX_NEG_CROP;
  910. + int i;
  911. + for(i=0; i<w; i++)
  912. + {
  913. + const int srcB= src[-2*srcStride];\
  914. + const int srcA= src[-1*srcStride];\
  915. + const int src0= src[0 *srcStride];\
  916. + const int src1= src[1 *srcStride];\
  917. + const int src2= src[2 *srcStride];\
  918. + const int src3= src[3 *srcStride];\
  919. + const int src4= src[4 *srcStride];\
  920. + const int src5= src[5 *srcStride];\
  921. + const int src6= src[6 *srcStride];\
  922. + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  923. + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  924. + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  925. + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  926. + dst++;\
  927. + src++;\
  928. + */
  929. +
  930. + set_pico_config(&h264_qpel4_v_lowpass_config1);
  931. +
  932. + {
  933. + int srcB= LD32(src - 2*srcStride);
  934. + int srcA= LD32(src - 1*srcStride);
  935. + int src0= LD32(src + 0 *srcStride);
  936. + int src1= LD32(src + 1 *srcStride);
  937. + int src2= LD32(src + 2 *srcStride);
  938. + int src3= LD32(src + 3 *srcStride);
  939. + int src4= LD32(src + 4 *srcStride);
  940. + int src5= LD32(src + 5 *srcStride);
  941. + int src6= LD32(src + 6 *srcStride);
  942. +
943. + /* First compute the leftmost three columns */
  944. + PICO_MVRC_W(PICO_INPIX0, srcB);
  945. + PICO_MVRC_W(PICO_INPIX1, srcA);
  946. + PICO_MVRC_W(PICO_INPIX2, src0);
  947. + PICO_OP(0, 0, 0, 3, 6);
  948. + PICO_MVRC_W(PICO_INPIX2, src1);
  949. + PICO_MVRC_W(PICO_INPIX1, src2);
  950. + PICO_MVRC_W(PICO_INPIX0, src3);
  951. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  952. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  953. + dst += dstStride;
  954. + PICO_MVRC_W(PICO_INPIX0, srcA);
  955. + PICO_MVRC_W(PICO_INPIX1, src0);
  956. + PICO_MVRC_W(PICO_INPIX2, src1);
  957. + PICO_OP(0, 0, 0, 3, 6);
  958. + PICO_MVRC_W(PICO_INPIX2, src2);
  959. + PICO_MVRC_W(PICO_INPIX1, src3);
  960. + PICO_MVRC_W(PICO_INPIX0, src4);
  961. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  962. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  963. + dst += dstStride;
  964. + PICO_MVRC_W(PICO_INPIX0, src0);
  965. + PICO_MVRC_W(PICO_INPIX1, src1);
  966. + PICO_MVRC_W(PICO_INPIX2, src2);
  967. + PICO_OP(0, 0, 0, 3, 6);
  968. + PICO_MVRC_W(PICO_INPIX2, src3);
  969. + PICO_MVRC_W(PICO_INPIX1, src4);
  970. + PICO_MVRC_W(PICO_INPIX0, src5);
  971. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  972. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  973. + dst += dstStride;
  974. + PICO_MVRC_W(PICO_INPIX0, src1);
  975. + PICO_MVRC_W(PICO_INPIX1, src2);
  976. + PICO_MVRC_W(PICO_INPIX2, src3);
  977. + PICO_OP(0, 0, 0, 3, 6);
  978. + PICO_MVRC_W(PICO_INPIX2, src4);
  979. + PICO_MVRC_W(PICO_INPIX1, src5);
  980. + PICO_MVRC_W(PICO_INPIX0, src6);
  981. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  982. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  983. + /* Now compute the last column */
  984. +
  985. + union wordbytes {
  986. + int word;
  987. + struct {
  988. + unsigned int t:8;
  989. + unsigned int u:8;
  990. + unsigned int l:8;
  991. + unsigned int b:8;
  992. + } bytes; } tmp1, tmp2, tmp3;
  993. +
  994. +
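+ /* AVR32 is big-endian, so the low byte of each word loaded above is
+ the rightmost pixel of its row. The bitfield assignments below
+ gather the fourth column of rows (B,1,4), (A,2,5) and (0,3,6) into
+ three words, letting the last column be filtered as one planar
+ vector with config2. */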
  995. + tmp1.bytes.t = srcB;
  996. + tmp1.bytes.u = src1;
  997. + tmp1.bytes.l = src4;
  998. +
  999. + tmp2.bytes.t = srcA;
  1000. + tmp2.bytes.u = src2;
  1001. + tmp2.bytes.l = src5;
  1002. +
  1003. + tmp3.bytes.t = src0;
  1004. + tmp3.bytes.u = src3;
  1005. + tmp3.bytes.l = src6;
  1006. +
  1007. + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
  1008. + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
  1009. + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
  1010. + set_pico_config(&h264_qpel4_v_lowpass_config2);
  1011. +
  1012. +
  1013. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  1014. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  1015. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  1016. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  1017. +
  1018. + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
  1019. + dst[3] = (char)(tmp1.bytes.b);
  1020. + dst[3 - dstStride] = (char)(tmp1.bytes.l);
  1021. + dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
  1022. + dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
  1023. +
  1024. + }
  1029. +}
  1030. +
  1031. +static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1032. +
  1033. + /*
  1034. + const int w=4;
  1035. + uint8_t *cm = cropTbl + MAX_NEG_CROP;
  1036. + int i;
  1037. + for(i=0; i<w; i++)
  1038. + {
  1039. + const int srcB= src[-2*srcStride];\
  1040. + const int srcA= src[-1*srcStride];\
  1041. + const int src0= src[0 *srcStride];\
  1042. + const int src1= src[1 *srcStride];\
  1043. + const int src2= src[2 *srcStride];\
  1044. + const int src3= src[3 *srcStride];\
  1045. + const int src4= src[4 *srcStride];\
  1046. + const int src5= src[5 *srcStride];\
  1047. + const int src6= src[6 *srcStride];\
  1048. + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  1049. + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  1050. + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  1051. + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  1052. + dst++;\
  1053. + src++;\
  1054. + */
  1055. + uint8_t tmp_block[4*4];
  1056. +
  1057. + set_pico_config(&h264_qpel4_v_lowpass_config1);
  1058. +
  1059. + {
  1060. + int srcB= LD32(src - 2*srcStride);
  1061. + int srcA= LD32(src - 1*srcStride);
  1062. + int src0= LD32(src + 0 *srcStride);
  1063. + int src1= LD32(src + 1 *srcStride);
  1064. + int src2= LD32(src + 2 *srcStride);
  1065. + int src3= LD32(src + 3 *srcStride);
  1066. + int src4= LD32(src + 4 *srcStride);
  1067. + int src5= LD32(src + 5 *srcStride);
  1068. + int src6= LD32(src + 6 *srcStride);
  1069. +
1070. + /* First compute the leftmost three columns */
  1071. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1072. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1073. + PICO_MVRC_W(PICO_INPIX2, src0);
  1074. + PICO_OP(0, 0, 0, 3, 6);
  1075. + PICO_MVRC_W(PICO_INPIX2, src1);
  1076. + PICO_MVRC_W(PICO_INPIX1, src2);
  1077. + PICO_MVRC_W(PICO_INPIX0, src3);
  1078. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1079. + ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0));
  1080. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1081. + PICO_MVRC_W(PICO_INPIX1, src0);
  1082. + PICO_MVRC_W(PICO_INPIX2, src1);
  1083. + PICO_OP(0, 0, 0, 3, 6);
  1084. + PICO_MVRC_W(PICO_INPIX2, src2);
  1085. + PICO_MVRC_W(PICO_INPIX1, src3);
  1086. + PICO_MVRC_W(PICO_INPIX0, src4);
  1087. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1088. + ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0));
  1089. + PICO_MVRC_W(PICO_INPIX0, src0);
  1090. + PICO_MVRC_W(PICO_INPIX1, src1);
  1091. + PICO_MVRC_W(PICO_INPIX2, src2);
  1092. + PICO_OP(0, 0, 0, 3, 6);
  1093. + PICO_MVRC_W(PICO_INPIX2, src3);
  1094. + PICO_MVRC_W(PICO_INPIX1, src4);
  1095. + PICO_MVRC_W(PICO_INPIX0, src5);
  1096. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1097. + ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0));
  1098. + PICO_MVRC_W(PICO_INPIX0, src1);
  1099. + PICO_MVRC_W(PICO_INPIX1, src2);
  1100. + PICO_MVRC_W(PICO_INPIX2, src3);
  1101. + PICO_OP(0, 0, 0, 3, 6);
  1102. + PICO_MVRC_W(PICO_INPIX2, src4);
  1103. + PICO_MVRC_W(PICO_INPIX1, src5);
  1104. + PICO_MVRC_W(PICO_INPIX0, src6);
  1105. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1106. + ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0));
  1107. + /* Now compute the last column */
  1108. +
  1109. + union wordbytes {
  1110. + int word;
  1111. + struct {
  1112. + unsigned int t:8;
  1113. + unsigned int u:8;
  1114. + unsigned int l:8;
  1115. + unsigned int b:8;
  1116. + } bytes; } tmp1, tmp2, tmp3;
  1117. +
  1118. +
  1119. + tmp1.bytes.t = srcB;
  1120. + tmp1.bytes.u = src1;
  1121. + tmp1.bytes.l = src4;
  1122. +
  1123. + tmp2.bytes.t = srcA;
  1124. + tmp2.bytes.u = src2;
  1125. + tmp2.bytes.l = src5;
  1126. +
  1127. + tmp3.bytes.t = src0;
  1128. + tmp3.bytes.u = src3;
  1129. + tmp3.bytes.l = src6;
  1130. +
  1131. + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
  1132. + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
  1133. + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
  1134. + set_pico_config(&h264_qpel4_v_lowpass_config2);
  1135. +
  1136. +
  1137. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  1138. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  1139. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  1140. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  1141. +
  1142. + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
  1143. + tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
  1144. + tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
  1145. + tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
  1146. + tmp_block[3] = (char)(tmp1.bytes.t);
  1147. +
  1148. + /* Compute the average */
  1149. + srcB= LD32(dst);
  1150. + srcA= LD32(dst + dstStride);
  1151. + src0= LD32(dst + dstStride*2);
  1152. + src1= LD32(dst + dstStride*3);
  1153. +
  1154. + src2= LD32(tmp_block);
  1155. + src3= LD32(tmp_block + 4);
  1156. + src4= LD32(tmp_block + 8);
  1157. + src5= LD32(tmp_block + 12);
  1158. +
  1159. + ST32(dst, rnd_avg32(srcB, src2));
  1160. + ST32(dst + dstStride, rnd_avg32(srcA, src3));
  1161. + ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
  1162. + ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
  1163. + }
  1164. +}
  1165. +
  1166. +static struct pico_config_t h264_qpel4_hv_lowpass_config = {
  1167. + .input_mode = PICO_HOR_FILTER_MODE,
  1168. + .output_mode = PICO_PACKED_MODE,
  1169. + .coeff_frac_bits = 10,
  1170. + .offset_frac_bits = 10,
  1171. + .coeff0_0 = 1,
  1172. + .coeff0_1 = -5,
  1173. + .coeff0_2 = 20,
  1174. + .coeff0_3 = 512,
  1175. + .coeff1_0 = -5,
  1176. + .coeff1_1 = 25,
  1177. + .coeff1_2 = -100,
  1178. + .coeff1_3 = 0,
  1179. + .coeff2_0 = 20,
  1180. + .coeff2_1 = -100,
  1181. + .coeff2_2 = 400,
  1182. + .coeff2_3 = 0
  1183. +};
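+ /* The hv case fuses both 6-tap passes: the three coefficient rows
+ are the taps (1, -5, 20) scaled by 1, -5 and 20 respectively (an
+ outer product of the kernel with itself), and the two >> 5 shifts
+ merge into coeff_frac_bits = 10 with a combined rounding constant
+ of 512. */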
  1184. +
  1185. +static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1186. +
  1187. + int32_t tmp_block[48];
  1188. + int32_t *tmp = tmp_block;
  1189. + int i;
  1190. +
  1191. + set_pico_config(&h264_qpel4_hv_lowpass_config);
  1192. +
  1193. + src -= 2;
  1194. + for ( i = 0; i < 2; i++ ){
  1195. + int srcB= LD32(src - 2*srcStride);
  1196. + int srcA= LD32(src - 1*srcStride);
  1197. + int src0= LD32(src + 0 *srcStride);
  1198. + int src1= LD32(src + 1 *srcStride);
  1199. + int src2= LD32(src + 2 *srcStride);
  1200. + int src3= LD32(src + 3 *srcStride);
  1201. + int src4= LD32(src + 4 *srcStride);
  1202. + int src5= LD32(src + 5 *srcStride);
  1203. + int src6= LD32(src + 6 *srcStride);
  1204. +
  1205. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1206. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1207. + PICO_MVRC_W(PICO_INPIX2, src0);
  1208. + PICO_OP(0, 0, 0, 4, 8);
  1209. + PICO_MVRC_W(PICO_INPIX2, src1);
  1210. + PICO_MVRC_W(PICO_INPIX1, src2);
  1211. + PICO_MVRC_W(PICO_INPIX0, src3);
  1212. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1213. + PICO_STCM_W(tmp,
  1214. + PICO_REGVECT_VMU0_OUT,
  1215. + PICO_REGVECT_VMU1_OUT,
  1216. + PICO_REGVECT_VMU2_OUT);
  1217. + tmp += 3;
  1218. +
  1219. + PICO_OP(0, 0, 1, 5, 9);
  1220. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1221. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1222. + PICO_MVRC_W(PICO_INPIX2, src0);
  1223. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1224. + PICO_STCM_W(tmp,
  1225. + PICO_REGVECT_VMU0_OUT,
  1226. + PICO_REGVECT_VMU1_OUT,
  1227. + PICO_REGVECT_VMU2_OUT);
  1228. + tmp += 3;
  1229. +
  1230. + PICO_MVRC_W(PICO_INPIX0, src1);
  1231. + PICO_OP(0, 0, 4, 8, 0);
  1232. + PICO_MVRC_W(PICO_INPIX2, src2);
  1233. + PICO_MVRC_W(PICO_INPIX1, src3);
  1234. + PICO_MVRC_W(PICO_INPIX0, src4);
  1235. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1236. + PICO_STCM_W(tmp,
  1237. + PICO_REGVECT_VMU0_OUT,
  1238. + PICO_REGVECT_VMU1_OUT,
  1239. + PICO_REGVECT_VMU2_OUT);
  1240. + tmp += 3;
  1241. +
  1242. + PICO_OP(0, 0, 1, 5, 9);
  1243. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1244. + PICO_MVRC_W(PICO_INPIX1, src0);
  1245. + PICO_MVRC_W(PICO_INPIX2, src1);
  1246. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1247. + PICO_STCM_W(tmp,
  1248. + PICO_REGVECT_VMU0_OUT,
  1249. + PICO_REGVECT_VMU1_OUT,
  1250. + PICO_REGVECT_VMU2_OUT);
  1251. + tmp += 3;
  1252. +
  1253. + PICO_MVRC_W(PICO_INPIX0, src2);
  1254. + PICO_OP(0, 0, 4, 8, 0);
  1255. + PICO_MVRC_W(PICO_INPIX2, src3);
  1256. + PICO_MVRC_W(PICO_INPIX1, src4);
  1257. + PICO_MVRC_W(PICO_INPIX0, src5);
  1258. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1259. + PICO_STCM_W(tmp,
  1260. + PICO_REGVECT_VMU0_OUT,
  1261. + PICO_REGVECT_VMU1_OUT,
  1262. + PICO_REGVECT_VMU2_OUT);
  1263. + tmp += 3;
  1264. +
  1265. + PICO_OP(0, 0, 1, 5, 9);
  1266. + PICO_MVRC_W(PICO_INPIX0, src0);
  1267. + PICO_MVRC_W(PICO_INPIX1, src1);
  1268. + PICO_MVRC_W(PICO_INPIX2, src2);
  1269. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1270. + PICO_STCM_W(tmp,
  1271. + PICO_REGVECT_VMU0_OUT,
  1272. + PICO_REGVECT_VMU1_OUT,
  1273. + PICO_REGVECT_VMU2_OUT);
  1274. + tmp += 3;
  1275. +
  1276. + PICO_MVRC_W(PICO_INPIX0, src3);
  1277. + PICO_OP(0, 0, 4, 8, 0);
  1278. + PICO_MVRC_W(PICO_INPIX2, src4);
  1279. + PICO_MVRC_W(PICO_INPIX1, src5);
  1280. + PICO_MVRC_W(PICO_INPIX0, src6);
  1281. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1282. + PICO_STCM_W(tmp,
  1283. + PICO_REGVECT_VMU0_OUT,
  1284. + PICO_REGVECT_VMU1_OUT,
  1285. + PICO_REGVECT_VMU2_OUT);
  1286. + tmp += 3;
  1287. +
  1288. + PICO_OP(0, 0, 1, 5, 9);
  1289. + PICO_MVRC_W(PICO_INPIX0, src1);
  1290. + PICO_MVRC_W(PICO_INPIX1, src2);
  1291. + PICO_MVRC_W(PICO_INPIX2, src3);
  1292. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1293. + PICO_STCM_W(tmp,
  1294. + PICO_REGVECT_VMU0_OUT,
  1295. + PICO_REGVECT_VMU1_OUT,
  1296. + PICO_REGVECT_VMU2_OUT);
  1297. + tmp += 3;
  1298. + src += 2;
  1299. + }
  1300. +
  1301. + src -= 1;
  1302. + tmp -= 48;
  1303. +
  1304. +
  1305. + PICO_PUT_W(PICO_CONFIG,
  1306. + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
  1307. + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
  1308. + | PICO_COEFF_FRAC_BITS(10)
  1309. + | PICO_OFFSET_FRAC_BITS(10));
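+ /* Second pass: only PICO_CONFIG changes (vertical input mode); the
+ coefficient registers keep the fused taps set up above. Each
+ PICO_LDCM_W_INC below reloads a group of horizontal partial sums
+ from tmp[] into the VMU accumulators, and the remaining terms of
+ the separable 6x6 kernel are accumulated on top before the final
+ >> 10. */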
  1310. +
  1311. + for ( i = 0; i < 2; i++ ){
  1312. + int srcB= LD32(src - 2*srcStride);
  1313. + int srcA= LD32(src - 1*srcStride);
  1314. + int src0= LD32(src + 0 *srcStride);
  1315. + int src1= LD32(src + 1 *srcStride);
  1316. + int src2= LD32(src + 2 *srcStride);
  1317. + int src3= LD32(src + 3 *srcStride);
  1318. + int src4= LD32(src + 4 *srcStride);
  1319. + int src5= LD32(src + 5 *srcStride);
  1320. + int src6= LD32(src + 6 *srcStride);
  1321. +
  1322. +
  1323. + PICO_LDCM_W_INC(tmp,
  1324. + PICO_REGVECT_VMU0_OUT,
  1325. + PICO_REGVECT_VMU1_OUT,
  1326. + PICO_REGVECT_VMU2_OUT);
  1327. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1328. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1329. + PICO_MVRC_W(PICO_INPIX2, src0);
  1330. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1331. + PICO_MVRC_W(PICO_INPIX2, src1);
  1332. + PICO_MVRC_W(PICO_INPIX1, src2);
  1333. + PICO_MVRC_W(PICO_INPIX0, src3);
  1334. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1335. +
  1336. + PICO_LDCM_W_INC(tmp,
  1337. + PICO_REGVECT_VMU0_OUT,
  1338. + PICO_REGVECT_VMU1_OUT,
  1339. + PICO_REGVECT_VMU2_OUT);
  1340. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1341. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1342. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1343. + PICO_MVRC_W(PICO_INPIX2, src0);
  1344. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1345. +
  1346. + PICO_LDCM_W_INC(tmp,
  1347. + PICO_REGVECT_VMU0_OUT,
  1348. + PICO_REGVECT_VMU1_OUT,
  1349. + PICO_REGVECT_VMU2_OUT);
  1350. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1351. + PICO_MVRC_W(PICO_INPIX1, src0);
  1352. + PICO_MVRC_W(PICO_INPIX2, src1);
  1353. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1354. + PICO_MVRC_W(PICO_INPIX2, src2);
  1355. + PICO_MVRC_W(PICO_INPIX1, src3);
  1356. + PICO_MVRC_W(PICO_INPIX0, src4);
  1357. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1358. +
  1359. + PICO_LDCM_W_INC(tmp,
  1360. + PICO_REGVECT_VMU0_OUT,
  1361. + PICO_REGVECT_VMU1_OUT,
  1362. + PICO_REGVECT_VMU2_OUT);
  1363. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1364. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1365. + PICO_MVRC_W(PICO_INPIX1, src0);
  1366. + PICO_MVRC_W(PICO_INPIX2, src1);
  1367. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1368. +
  1369. + ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
  1370. + ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
  1371. +
  1372. +
  1373. + PICO_LDCM_W_INC(tmp,
  1374. + PICO_REGVECT_VMU0_OUT,
  1375. + PICO_REGVECT_VMU1_OUT,
  1376. + PICO_REGVECT_VMU2_OUT);
  1377. + PICO_MVRC_W(PICO_INPIX0, src0);
  1378. + PICO_MVRC_W(PICO_INPIX1, src1);
  1379. + PICO_MVRC_W(PICO_INPIX2, src2);
  1380. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1381. + PICO_MVRC_W(PICO_INPIX2, src3);
  1382. + PICO_MVRC_W(PICO_INPIX1, src4);
  1383. + PICO_MVRC_W(PICO_INPIX0, src5);
  1384. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1385. +
  1386. + PICO_LDCM_W_INC(tmp,
  1387. + PICO_REGVECT_VMU0_OUT,
  1388. + PICO_REGVECT_VMU1_OUT,
  1389. + PICO_REGVECT_VMU2_OUT);
  1390. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1391. + PICO_MVRC_W(PICO_INPIX0, src0);
  1392. + PICO_MVRC_W(PICO_INPIX1, src1);
  1393. + PICO_MVRC_W(PICO_INPIX2, src2);
  1394. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1395. +
  1396. + PICO_LDCM_W_INC(tmp,
  1397. + PICO_REGVECT_VMU0_OUT,
  1398. + PICO_REGVECT_VMU1_OUT,
  1399. + PICO_REGVECT_VMU2_OUT);
  1400. + PICO_MVRC_W(PICO_INPIX0, src1);
  1401. + PICO_MVRC_W(PICO_INPIX1, src2);
  1402. + PICO_MVRC_W(PICO_INPIX2, src3);
  1403. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1404. + PICO_MVRC_W(PICO_INPIX2, src4);
  1405. + PICO_MVRC_W(PICO_INPIX1, src5);
  1406. + PICO_MVRC_W(PICO_INPIX0, src6);
  1407. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1408. +
  1409. + PICO_LDCM_W_INC(tmp,
  1410. + PICO_REGVECT_VMU0_OUT,
  1411. + PICO_REGVECT_VMU1_OUT,
  1412. + PICO_REGVECT_VMU2_OUT);
  1413. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1414. + PICO_MVRC_W(PICO_INPIX0, src1);
  1415. + PICO_MVRC_W(PICO_INPIX1, src2);
  1416. + PICO_MVRC_W(PICO_INPIX2, src3);
  1417. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1418. +
  1419. + ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
  1420. + ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
  1421. +
  1422. + dst += 2;
  1423. + src += 2;
  1424. + }
  1425. +}
  1426. +
  1427. +
  1428. +
  1429. +
  1430. +static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1431. +
  1432. + int32_t tmp_block[48];
  1433. + int32_t *tmp = tmp_block;
  1434. + int i;
  1435. +
  1436. + set_pico_config(&h264_qpel4_hv_lowpass_config);
  1437. +
  1438. + src -= 2;
  1439. + for ( i = 0; i < 2; i++ ){
  1440. + int srcB= LD32(src - 2*srcStride);
  1441. + int srcA= LD32(src - 1*srcStride);
  1442. + int src0= LD32(src + 0 *srcStride);
  1443. + int src1= LD32(src + 1 *srcStride);
  1444. + int src2= LD32(src + 2 *srcStride);
  1445. + int src3= LD32(src + 3 *srcStride);
  1446. + int src4= LD32(src + 4 *srcStride);
  1447. + int src5= LD32(src + 5 *srcStride);
  1448. + int src6= LD32(src + 6 *srcStride);
  1449. +
  1450. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1451. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1452. + PICO_MVRC_W(PICO_INPIX2, src0);
  1453. + PICO_OP(0, 0, 0, 4, 8);
  1454. + PICO_MVRC_W(PICO_INPIX2, src1);
  1455. + PICO_MVRC_W(PICO_INPIX1, src2);
  1456. + PICO_MVRC_W(PICO_INPIX0, src3);
  1457. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1458. + PICO_STCM_W(tmp,
  1459. + PICO_REGVECT_VMU0_OUT,
  1460. + PICO_REGVECT_VMU1_OUT,
  1461. + PICO_REGVECT_VMU2_OUT);
  1462. + tmp += 3;
  1463. +
  1464. + PICO_OP(0, 0, 1, 5, 9);
  1465. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1466. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1467. + PICO_MVRC_W(PICO_INPIX2, src0);
  1468. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1469. + PICO_STCM_W(tmp,
  1470. + PICO_REGVECT_VMU0_OUT,
  1471. + PICO_REGVECT_VMU1_OUT,
  1472. + PICO_REGVECT_VMU2_OUT);
  1473. + tmp += 3;
  1474. +
  1475. + PICO_MVRC_W(PICO_INPIX0, src1);
  1476. + PICO_OP(0, 0, 4, 8, 0);
  1477. + PICO_MVRC_W(PICO_INPIX2, src2);
  1478. + PICO_MVRC_W(PICO_INPIX1, src3);
  1479. + PICO_MVRC_W(PICO_INPIX0, src4);
  1480. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1481. + PICO_STCM_W(tmp,
  1482. + PICO_REGVECT_VMU0_OUT,
  1483. + PICO_REGVECT_VMU1_OUT,
  1484. + PICO_REGVECT_VMU2_OUT);
  1485. + tmp += 3;
  1486. +
  1487. + PICO_OP(0, 0, 1, 5, 9);
  1488. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1489. + PICO_MVRC_W(PICO_INPIX1, src0);
  1490. + PICO_MVRC_W(PICO_INPIX2, src1);
  1491. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1492. + PICO_STCM_W(tmp,
  1493. + PICO_REGVECT_VMU0_OUT,
  1494. + PICO_REGVECT_VMU1_OUT,
  1495. + PICO_REGVECT_VMU2_OUT);
  1496. + tmp += 3;
  1497. +
  1498. + PICO_MVRC_W(PICO_INPIX0, src2);
  1499. + PICO_OP(0, 0, 4, 8, 0);
  1500. + PICO_MVRC_W(PICO_INPIX2, src3);
  1501. + PICO_MVRC_W(PICO_INPIX1, src4);
  1502. + PICO_MVRC_W(PICO_INPIX0, src5);
  1503. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1504. + PICO_STCM_W(tmp,
  1505. + PICO_REGVECT_VMU0_OUT,
  1506. + PICO_REGVECT_VMU1_OUT,
  1507. + PICO_REGVECT_VMU2_OUT);
  1508. + tmp += 3;
  1509. +
  1510. + PICO_OP(0, 0, 1, 5, 9);
  1511. + PICO_MVRC_W(PICO_INPIX0, src0);
  1512. + PICO_MVRC_W(PICO_INPIX1, src1);
  1513. + PICO_MVRC_W(PICO_INPIX2, src2);
  1514. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1515. + PICO_STCM_W(tmp,
  1516. + PICO_REGVECT_VMU0_OUT,
  1517. + PICO_REGVECT_VMU1_OUT,
  1518. + PICO_REGVECT_VMU2_OUT);
  1519. + tmp += 3;
  1520. +
  1521. + PICO_MVRC_W(PICO_INPIX0, src3);
  1522. + PICO_OP(0, 0, 4, 8, 0);
  1523. + PICO_MVRC_W(PICO_INPIX2, src4);
  1524. + PICO_MVRC_W(PICO_INPIX1, src5);
  1525. + PICO_MVRC_W(PICO_INPIX0, src6);
  1526. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1527. + PICO_STCM_W(tmp,
  1528. + PICO_REGVECT_VMU0_OUT,
  1529. + PICO_REGVECT_VMU1_OUT,
  1530. + PICO_REGVECT_VMU2_OUT);
  1531. + tmp += 3;
  1532. +
  1533. + PICO_OP(0, 0, 1, 5, 9);
  1534. + PICO_MVRC_W(PICO_INPIX0, src1);
  1535. + PICO_MVRC_W(PICO_INPIX1, src2);
  1536. + PICO_MVRC_W(PICO_INPIX2, src3);
  1537. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1538. + PICO_STCM_W(tmp,
  1539. + PICO_REGVECT_VMU0_OUT,
  1540. + PICO_REGVECT_VMU1_OUT,
  1541. + PICO_REGVECT_VMU2_OUT);
  1542. + tmp += 3;
  1543. + src += 2;
  1544. + }
  1545. +
  1546. + src -= 1;
  1547. + tmp -= 48;
  1548. +
  1549. +
  1550. + PICO_PUT_W(PICO_CONFIG,
  1551. + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
  1552. + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
  1553. + | PICO_COEFF_FRAC_BITS(10)
  1554. + | PICO_OFFSET_FRAC_BITS(10));
  1555. +
  1556. + for ( i = 0; i < 2; i++ ){
  1557. + int srcB= LD32(src - 2*srcStride);
  1558. + int srcA= LD32(src - 1*srcStride);
  1559. + int src0= LD32(src + 0 *srcStride);
  1560. + int src1= LD32(src + 1 *srcStride);
  1561. + int src2= LD32(src + 2 *srcStride);
  1562. + int src3= LD32(src + 3 *srcStride);
  1563. + int src4= LD32(src + 4 *srcStride);
  1564. + int src5= LD32(src + 5 *srcStride);
  1565. + int src6= LD32(src + 6 *srcStride);
  1566. +
  1567. + PICO_LDCM_W_INC(tmp,
  1568. + PICO_REGVECT_VMU0_OUT,
  1569. + PICO_REGVECT_VMU1_OUT,
  1570. + PICO_REGVECT_VMU2_OUT);
  1571. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1572. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1573. + PICO_MVRC_W(PICO_INPIX2, src0);
  1574. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1575. + PICO_MVRC_W(PICO_INPIX2, src1);
  1576. + PICO_MVRC_W(PICO_INPIX1, src2);
  1577. + PICO_MVRC_W(PICO_INPIX0, src3);
  1578. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1579. +
  1580. + PICO_LDCM_W_INC(tmp,
  1581. + PICO_REGVECT_VMU0_OUT,
  1582. + PICO_REGVECT_VMU1_OUT,
  1583. + PICO_REGVECT_VMU2_OUT);
  1584. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1585. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1586. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1587. + PICO_MVRC_W(PICO_INPIX2, src0);
  1588. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1589. +
  1590. + PICO_LDCM_W_INC(tmp,
  1591. + PICO_REGVECT_VMU0_OUT,
  1592. + PICO_REGVECT_VMU1_OUT,
  1593. + PICO_REGVECT_VMU2_OUT);
  1594. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1595. + PICO_MVRC_W(PICO_INPIX1, src0);
  1596. + PICO_MVRC_W(PICO_INPIX2, src1);
  1597. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1598. + PICO_MVRC_W(PICO_INPIX2, src2);
  1599. + PICO_MVRC_W(PICO_INPIX1, src3);
  1600. + PICO_MVRC_W(PICO_INPIX0, src4);
  1601. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1602. +
  1603. + PICO_LDCM_W_INC(tmp,
  1604. + PICO_REGVECT_VMU0_OUT,
  1605. + PICO_REGVECT_VMU1_OUT,
  1606. + PICO_REGVECT_VMU2_OUT);
  1607. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1608. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1609. + PICO_MVRC_W(PICO_INPIX1, src0);
  1610. + PICO_MVRC_W(PICO_INPIX2, src1);
  1611. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1612. +
  1613. + ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
  1614. + ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0)));
  1615. +
  1616. +
  1617. + PICO_LDCM_W_INC(tmp,
  1618. + PICO_REGVECT_VMU0_OUT,
  1619. + PICO_REGVECT_VMU1_OUT,
  1620. + PICO_REGVECT_VMU2_OUT);
  1621. + PICO_MVRC_W(PICO_INPIX0, src0);
  1622. + PICO_MVRC_W(PICO_INPIX1, src1);
  1623. + PICO_MVRC_W(PICO_INPIX2, src2);
  1624. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1625. + PICO_MVRC_W(PICO_INPIX2, src3);
  1626. + PICO_MVRC_W(PICO_INPIX1, src4);
  1627. + PICO_MVRC_W(PICO_INPIX0, src5);
  1628. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1629. +
  1630. + PICO_LDCM_W_INC(tmp,
  1631. + PICO_REGVECT_VMU0_OUT,
  1632. + PICO_REGVECT_VMU1_OUT,
  1633. + PICO_REGVECT_VMU2_OUT);
  1634. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1635. + PICO_MVRC_W(PICO_INPIX0, src0);
  1636. + PICO_MVRC_W(PICO_INPIX1, src1);
  1637. + PICO_MVRC_W(PICO_INPIX2, src2);
  1638. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1639. +
  1640. + PICO_LDCM_W_INC(tmp,
  1641. + PICO_REGVECT_VMU0_OUT,
  1642. + PICO_REGVECT_VMU1_OUT,
  1643. + PICO_REGVECT_VMU2_OUT);
  1644. + PICO_MVRC_W(PICO_INPIX0, src1);
  1645. + PICO_MVRC_W(PICO_INPIX1, src2);
  1646. + PICO_MVRC_W(PICO_INPIX2, src3);
  1647. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1648. + PICO_MVRC_W(PICO_INPIX2, src4);
  1649. + PICO_MVRC_W(PICO_INPIX1, src5);
  1650. + PICO_MVRC_W(PICO_INPIX0, src6);
  1651. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1652. +
  1653. + PICO_LDCM_W_INC(tmp,
  1654. + PICO_REGVECT_VMU0_OUT,
  1655. + PICO_REGVECT_VMU1_OUT,
  1656. + PICO_REGVECT_VMU2_OUT);
  1657. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1658. + PICO_MVRC_W(PICO_INPIX0, src1);
  1659. + PICO_MVRC_W(PICO_INPIX1, src2);
  1660. + PICO_MVRC_W(PICO_INPIX2, src3);
  1661. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1662. +
  1663. + ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
  1664. + ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0)));
  1665. +
  1666. + dst += 2;
  1667. + src += 2;
  1668. + }
  1669. +}
  1670. +
  1671. +
  1672. +static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1673. + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1674. + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1675. + src += 4*srcStride;
  1676. + dst += 4*dstStride;
  1677. + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1678. + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1679. +}
  1680. +
  1681. +static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1682. + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1683. + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1684. + src += 4*srcStride;
  1685. + dst += 4*dstStride;
  1686. + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1687. + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1688. +}
  1689. +
  1690. +static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1691. + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1692. + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1693. + src += 4*srcStride;
  1694. + dst += 4*dstStride;
  1695. + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1696. + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1697. +}
  1698. +
  1699. +static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1700. + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1701. + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1702. + src += 4*srcStride;
  1703. + dst += 4*dstStride;
  1704. + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1705. + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1706. +}
  1707. +
  1708. +static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1709. + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1710. + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1711. + src += 4*srcStride;
  1712. + dst += 4*dstStride;
  1713. + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1714. + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1715. +}
  1716. +
  1717. +static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1718. + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1719. + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1720. + src += 4*srcStride;
  1721. + dst += 4*dstStride;
  1722. + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1723. + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1724. +}
  1725. +
  1726. +static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1727. + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1728. + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1729. + src += 8*srcStride;
  1730. + dst += 8*dstStride;
  1731. + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1732. + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1733. +}
  1734. +
  1735. +static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1736. + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1737. + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1738. + src += 8*srcStride;
  1739. + dst += 8*dstStride;
  1740. + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1741. + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1742. +}
  1743. +
  1744. +static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1745. + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1746. + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1747. + src += 8*srcStride;
  1748. + dst += 8*dstStride;
  1749. + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1750. + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1751. +}
  1752. +
  1753. +static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1754. + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1755. + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1756. + src += 8*srcStride;
  1757. + dst += 8*dstStride;
  1758. + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1759. + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1760. +}
  1761. +
  1762. +static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1763. + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1764. + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1765. + src += 8*srcStride;
  1766. + dst += 8*dstStride;
  1767. + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1768. + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1769. +}
  1770. +
  1771. +static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1772. + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1773. + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1774. + src += 8*srcStride;
  1775. + dst += 8*dstStride;
  1776. + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1777. + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1778. +}
  1779. +
  1780. +
  1781. +#define H264_MC(OPNAME, SIZE) \
  1782. +static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
  1783. + OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
  1784. +}\
  1785. +\
  1786. +static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
  1787. + uint8_t half[SIZE*SIZE];\
  1788. + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
  1789. + OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
  1790. +}\
  1791. +\
  1792. +static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
  1793. + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
  1794. +}\
  1795. +\
  1796. +static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
  1797. + uint8_t half[SIZE*SIZE];\
  1798. + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
  1799. + OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
  1800. +}\
  1801. +\
  1802. +static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
  1803. + uint8_t full[SIZE*(SIZE+5)];\
  1804. + uint8_t * const full_mid= full + SIZE*2;\
  1805. + uint8_t half[SIZE*SIZE];\
  1806. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1807. + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
  1808. + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
  1809. +}\
  1810. +\
  1811. +static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
  1812. + uint8_t full[SIZE*(SIZE+5)];\
  1813. + uint8_t * const full_mid= full + SIZE*2;\
  1814. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1815. + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
  1816. +}\
  1817. +\
  1818. +static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
  1819. + uint8_t full[SIZE*(SIZE+5)];\
  1820. + uint8_t * const full_mid= full + SIZE*2;\
  1821. + uint8_t half[SIZE*SIZE];\
  1822. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1823. + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
  1824. + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
  1825. +}\
  1826. +\
  1827. +static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
  1828. + uint8_t full[SIZE*(SIZE+5)];\
  1829. + uint8_t * const full_mid= full + SIZE*2;\
  1830. + uint8_t halfH[SIZE*SIZE];\
  1831. + uint8_t halfV[SIZE*SIZE];\
  1832. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1833. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1834. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1835. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1836. +}\
  1837. +\
  1838. +static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
  1839. + uint8_t full[SIZE*(SIZE+5)];\
  1840. + uint8_t * const full_mid= full + SIZE*2;\
  1841. + uint8_t halfH[SIZE*SIZE];\
  1842. + uint8_t halfV[SIZE*SIZE];\
  1843. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1844. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1845. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1846. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1847. +}\
  1848. +\
  1849. +static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
  1850. + uint8_t full[SIZE*(SIZE+5)];\
  1851. + uint8_t * const full_mid= full + SIZE*2;\
  1852. + uint8_t halfH[SIZE*SIZE];\
  1853. + uint8_t halfV[SIZE*SIZE];\
  1854. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1855. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1856. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1857. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1858. +}\
  1859. +\
  1860. +static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
  1861. + uint8_t full[SIZE*(SIZE+5)];\
  1862. + uint8_t * const full_mid= full + SIZE*2;\
  1863. + uint8_t halfH[SIZE*SIZE];\
  1864. + uint8_t halfV[SIZE*SIZE];\
  1865. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1866. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1867. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1868. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1869. +}\
  1870. +\
  1871. +static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
  1872. + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
  1873. +}\
  1874. +\
  1875. +static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
  1876. + uint8_t halfH[SIZE*SIZE];\
  1877. + uint8_t halfHV[SIZE*SIZE];\
  1878. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1879. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1880. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  1881. +}\
  1882. +\
  1883. +static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
  1884. + uint8_t halfH[SIZE*SIZE];\
  1885. + uint8_t halfHV[SIZE*SIZE];\
  1886. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1887. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1888. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  1889. +}\
  1890. +\
  1891. +static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
  1892. + uint8_t full[SIZE*(SIZE+5)];\
  1893. + uint8_t * const full_mid= full + SIZE*2;\
  1894. + uint8_t halfV[SIZE*SIZE];\
  1895. + uint8_t halfHV[SIZE*SIZE];\
  1896. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1897. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1898. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1899. + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  1900. +}\
  1901. +\
  1902. +static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
  1903. + uint8_t full[SIZE*(SIZE+5)];\
  1904. + uint8_t * const full_mid= full + SIZE*2;\
  1905. + uint8_t halfV[SIZE*SIZE];\
  1906. + uint8_t halfHV[SIZE*SIZE];\
  1907. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1908. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1909. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1910. + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  1911. +}\
  1912. +
  1913. +H264_MC(put_, 4)
  1914. +H264_MC(put_, 8)
  1915. +H264_MC(put_, 16)
  1916. +H264_MC(avg_, 4)
  1917. +H264_MC(avg_, 8)
  1918. +H264_MC(avg_, 16)
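+ /* The mcXY suffix encodes the quarter-pel phase: X is the horizontal
+ and Y the vertical quarter-sample offset (0..3). mc00 is a plain
+ copy, mc20/mc02 the horizontal/vertical half-pel filters, mc22 the
+ 2D half-pel case, and the remaining phases average two of those
+ intermediates, mirroring the generic C H264_MC macro. */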
  1919. +
  1920. +
  1921. +
  1922. +#define dspfunc16(PFX) \
  1923. + void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1924. + PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
  1925. + PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
  1926. + }\
  1927. + void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1928. + PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
  1929. + PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
  1930. + }\
  1931. + void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1932. + PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
  1933. + PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
  1934. + }\
  1935. + void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1936. + PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
  1937. + PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
  1938. + }\
  1939. +
  1940. +
  1941. +dspfunc16(put)
  1942. +dspfunc16(put_no_rnd)
  1943. +dspfunc16(avg)
  1944. +dspfunc16(avg_no_rnd)
  1945. +#undef dspfunc16
  1946. +
  1947. +static int pix_sum_avr32(uint8_t * pix, int line_size)
  1948. +{
  1949. + int s, i;
  1950. +
  1951. + s = 0;
  1952. + for (i = 0; i < 16; i++) {
  1953. + int tmp1,tmp2,tmp3,tmp4,tmp5;
  1954. + __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
  1955. + "ld.w\t%1, %6[4]\n\t"
  1956. + "ld.w\t%2, %6[8]\n\t"
  1957. + "ld.w\t%3, %6[12]\n\t"
  1958. + "punpckub.h\t%4, %0:t\n\t"
  1959. + "padd.h\t%5, %5, %4\n\t"
  1960. + "punpckub.h\t%4, %0:b\n\t"
  1961. + "padd.h\t%5, %5, %4\n\t"
  1962. + "punpckub.h\t%4, %1:t\n\t"
  1963. + "padd.h\t%5, %5, %4\n\t"
  1964. + "punpckub.h\t%4, %1:b\n\t"
  1965. + "padd.h\t%5, %5, %4\n\t"
  1966. + "punpckub.h\t%4, %2:t\n\t"
  1967. + "padd.h\t%5, %5, %4\n\t"
  1968. + "punpckub.h\t%4, %2:b\n\t"
  1969. + "padd.h\t%5, %5, %4\n\t"
  1970. + "punpckub.h\t%4, %3:t\n\t"
  1971. + "padd.h\t%5, %5, %4\n\t"
  1972. + "punpckub.h\t%4, %3:b\n\t"
  1973. + "padd.h\t%5, %5, %4\n\t"
  1974. + : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"=&r"(s)
  1975. + : "r"(pix));
  1976. + pix += line_size;
  1977. + }
  1978. + __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "=&r" (s) );
  1979. +
  1980. + return s;
  1981. +}
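+ /* Each halfword lane of the accumulator gathers at most
+ 16 rows * 8 samples * 255 = 32640 < 65536, so the packed adds
+ cannot overflow before addhh.w folds the two lanes. A portable
+ sketch of what the asm computes (a plain 16x16 byte sum):
+
+ static int pix_sum_c(uint8_t *pix, int line_size)
+ {
+ int s = 0, i, j;
+ for (i = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++)
+ s += pix[j];
+ pix += line_size;
+ }
+ return s;
+ }
+ */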
  1982. +
  1983. +
  1984. +//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
  1985. +//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
  1986. +//#define H264_WEIGHT(W,H) \
  1987. +//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
  1988. +// int attribute_unused x, y; \
  1989. +// offset <<= log2_denom; \
  1990. +// if(log2_denom) offset += 1<<(log2_denom-1); \
  1991. +// for(y=0; y<H; y++, block += stride){ \
  1992. +// uint32_t tmp0, tmp1;
  1993. +// if(W==2) { \
  1994. +// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
  1995. +// "ld.ub\t%[tmp1], %[block][1]\n" \
  1996. +// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
  1997. +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
  1998. +// "asr\t%[tmp0], %[log2_denom]\n" \
  1999. +// "asr\t%[tmp1], %[log2_denom]\n" \
  2000. +// "satu\t%[tmp0] >> 0, 8\n" \
  2001. +// "satu\t%[tmp1] >> 0, 8\n" \
  2002. +// "st.b\t%[block][0], %[tmp0]\n" \
  2003. +// "st.b\t%[block][1], %[tmp1]\n" \
  2004. +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
  2005. +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
  2006. +// } else if ( W==4 ) { \
  2007. +// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
  2008. +// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
  2009. +// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
  2010. +// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
  2011. +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
  2012. +// "asr\t%[tmp0], %[log2_denom]\n" \
  2013. +// "asr\t%[tmp1], %[log2_denom]\n" \
  2014. +// "satu\t%[tmp0] >> 0, 8\n" \
  2015. +// "satu\t%[tmp1] >> 0, 8\n" \
  2016. +// "st.b\t%[block][0], %[tmp0]\n" \
  2017. +// "st.b\t%[block][1], %[tmp1]\n" \
  2018. +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
  2019. +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
  2020. +//
  2021. +//
  2022. +//
  2023. +// if(W==4) continue; \
  2024. +// op_scale1(4); \
  2025. +// op_scale1(5); \
  2026. +// op_scale1(6); \
  2027. +// op_scale1(7); \
  2028. +// if(W==8) continue; \
  2029. +// op_scale1(8); \
  2030. +// op_scale1(9); \
  2031. +// op_scale1(10); \
  2032. +// op_scale1(11); \
  2033. +// op_scale1(12); \
  2034. +// op_scale1(13); \
  2035. +// op_scale1(14); \
  2036. +// op_scale1(15); \
  2037. +// } \
  2038. +//} \
  2039. +//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
  2040. +// int attribute_unused x, y; \
  2041. +// int offset = (offsets + offsetd + 1) >> 1; \
  2042. +// offset = ((offset << 1) + 1) << log2_denom; \
  2043. +// for(y=0; y<H; y++, dst += stride, src += stride){ \
  2044. +// op_scale2(0); \
  2045. +// op_scale2(1); \
  2046. +// if(W==2) continue; \
  2047. +// op_scale2(2); \
  2048. +// op_scale2(3); \
  2049. +// if(W==4) continue; \
  2050. +// op_scale2(4); \
  2051. +// op_scale2(5); \
  2052. +// op_scale2(6); \
  2053. +// op_scale2(7); \
  2054. +// if(W==8) continue; \
  2055. +// op_scale2(8); \
  2056. +// op_scale2(9); \
  2057. +// op_scale2(10); \
  2058. +// op_scale2(11); \
  2059. +// op_scale2(12); \
  2060. +// op_scale2(13); \
  2061. +// op_scale2(14); \
  2062. +// op_scale2(15); \
  2063. +// } \
  2064. +//}
  2065. +
  2066. +
  2067. +
  2068. +/* Returns zero in each byte where the absolute difference between <a> and <b>
  2069. + is not less than <compare> */
  2070. +#define PABS_DIFF_LESS_THAN( a, b, compare) \
  2071. + ({ uint32_t __tmp__, __tmp2__, __mask__; \
  2072. + asm ( \
  2073. + /* Check ABS( a - b ) < compare */ \
  2074. + "psubs.ub\t%[tmp], %[opa], %[opb]\n" \
  2075. + "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
  2076. + "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \
  2077. + /* This produces 0 for all bytes where the comparison is not true */ \
  2078. + "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
  2079. + : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \
  2080. + : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
  2081. + __mask__; })
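+ /* Worked example, per byte: a = 0x10, b = 0x18, compare = 0x04 gives
+ |a-b| = 0x08, and psubs.ub(0x04, 0x08) saturates to zero (difference
+ not less than compare); with compare = 0x09 the result is 0x01, so
+ the byte is nonzero exactly when |a - b| < compare. */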
  2082. +
  2083. +/*
  2084. + Set all bytes containing zero in <value> to 255 and the rest to zero.
  2085. +
  2086. + Add with saturation 254 to all bytes making all bytes different from
  2087. + zero become 255. Then add one without saturation to make all bytes
  2088. + originally containing zero 255 and the rest 0. */
  2089. +#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
  2090. + ({ uint32_t __tmp__; \
  2091. + asm ( \
  2092. + "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \
  2093. + "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
  2094. + : [tmp] "=r"(__tmp__) \
  2095. + : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
  2096. + __tmp__; })
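+ /* Worked example: value = 0x00FF0300 -> padds.ub gives 0xFEFFFFFE ->
+ padd.b wraps to 0xFF0000FF, i.e. the zero bytes become 0xFF and all
+ other bytes become 0x00. */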
  2097. +
  2098. +#define PACKW_SH(upper, lower) \
  2099. + ({ uint32_t __tmp__; \
  2100. + asm ( \
  2101. + "packw.sh\t%[tmp], %[u], %[l]\n" \
  2102. + : [tmp] "=r"(__tmp__) \
  2103. + : [u] "r"(upper), [l] "r"(lower) ); \
  2104. + __tmp__; })
  2105. +
  2106. +#define PACKSH_UB(upper, lower) \
  2107. + ({ uint32_t __tmp__; \
  2108. + asm ( \
  2109. + "packsh.sb\t%[tmp], %[u], %[l]\n" \
  2110. + : [tmp] "=r"(__tmp__) \
  2111. + : [u] "r"(upper), [l] "r"(lower) ); \
  2112. + __tmp__; })
  2113. +
  2114. +static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  2115. +{
  2116. + int i;
  2117. +
  2118. + if ( alpha == 0 )
  2119. + return;
  2120. +
  2121. + alpha = PACKW_SH(alpha, alpha);
  2122. + alpha = PACKSH_UB(alpha, alpha);
  2123. + beta = PACKW_SH(beta, beta);
  2124. + beta = PACKSH_UB(beta, beta);
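+ /* Replicate alpha and beta into all four byte lanes (packw.sh copies
+ the value into both halfwords, packsh.ub then into all four bytes)
+ so the packed byte comparisons below test four columns per
+ iteration. */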
  2125. +
  2126. + for( i = 0; i < 4; i++ ) {
  2127. + uint32_t p0, p1, p2, q0, q1, q2;
  2128. + uint32_t mask, mask2;
  2129. + uint32_t tmp, tmp2, tmp3, tmp4;
  2130. +
  2131. + if( tc0[i] < 0 ) {
  2132. + pix += 4;
  2133. + continue;
  2134. + }
  2135. +
  2136. +/* for( d = 0; d < 4; d++ ) {
  2137. + const int p0 = pix[-1*stride];
  2138. + const int p1 = pix[-2*stride];
  2139. + const int p2 = pix[-3*stride];
  2140. + const int q0 = pix[0];
  2141. + const int q1 = pix[1*stride];
  2142. + const int q2 = pix[2*stride];
  2143. +
  2144. + if( ABS( p0 - q0 ) < alpha &&
  2145. + ABS( p1 - p0 ) < beta &&
  2146. + ABS( q1 - q0 ) < beta ) { */
  2147. +
  2148. + p0 = LD32(pix - stride);
  2149. + p1 = LD32(pix - 2*stride);
  2150. + q0 = LD32(pix);
  2151. + q1 = LD32(pix + stride);
  2152. +
2153. + /* Check which of the columns should be filtered, if any. The C
2154. + reference above requires all three conditions per column, so each
2155. + partial mask is normalized before being combined; mask ends up
2156. + 0xFF in every byte whose column must keep its original value. */
2157. + mask = SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(p0, q0, alpha));
2158. + mask |= SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(p1, p0, beta));
2159. + mask |= SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(q1, q0, beta));
2160. +
2161. + if ( mask == 0xFFFFFFFF ){ pix += 4; continue; }
  2162. +
  2163. +
  2164. + int tc = PACKW_SH(tc0[i], tc0[i]);
  2165. + int tc0_p = tc;
  2166. + int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
  2167. +
  2168. + /*
  2169. + int i_delta;
  2170. + if( ABS( p2 - p0 ) < beta ) {
  2171. + pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
  2172. + tc++;
  2173. + }*/
  2174. +
  2175. + p2 = LD32(pix - 3*stride);
  2176. + mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
  2177. +
  2178. + if ( mask2 ){
  2179. + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
  2180. + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
  2181. + "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
  2182. + "punpckub.h\t%[tmp2], %[tmp]:t\n"
  2183. + "punpckub.h\t%[tmp], %[tmp]:b\n"
  2184. + "punpckub.h\t%[tmp3], %[p1]:t\n"
  2185. + "punpckub.h\t%[tmp4], %[p1]:b\n"
  2186. + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2187. + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
  2188. + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
  2189. + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
  2190. + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
  2191. + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
  2192. + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2193. + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
  2194. + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
  2195. + "andn\t%[tmp], %[mask2]\n"
  2196. + "and\t%[tmp2], %[q1], %[mask2]\n"
  2197. + "or\t%[tmp], %[tmp2]\n"
  2198. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2199. + [tmp4]"=&r"(tmp4)
  2200. + : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
  2201. + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
  2202. + ST32(pix - 2*stride, tmp);
  2203. + tc += 0x00010001;
  2204. + }
  2205. +
  2206. +
  2207. + q2 = LD32(pix + 2*stride);
  2208. +
  2209. + /*
  2210. + if( ABS( q2 - q0 ) < beta ) {
  2211. + pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
  2212. + tc++;
  2213. + }
  2214. + */
  2215. + mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
  2216. +
  2217. + if ( mask2 ){
  2218. + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
  2219. + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
  2220. + "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
  2221. + "punpckub.h\t%[tmp2], %[tmp]:t\n"
  2222. + "punpckub.h\t%[tmp], %[tmp]:b\n"
  2223. + "punpckub.h\t%[tmp3], %[q1]:t\n"
  2224. + "punpckub.h\t%[tmp4], %[q1]:b\n"
  2225. + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2226. + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
  2227. + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
  2228. + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
  2229. + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
  2230. + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
  2231. + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2232. + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
  2233. + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
  2234. + "andn\t%[tmp], %[mask2]\n"
  2235. + "and\t%[tmp2], %[q1], %[mask2]\n"
  2236. + "or\t%[tmp], %[tmp2]\n"
  2237. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2238. + [tmp4]"=&r"(tmp4)
  2239. + : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
  2240. + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
  2241. + ST32(pix + stride, tmp);
  2242. + tc += 0x00010001;
  2243. + }
  2244. +
  2245. + uint32_t old_p0 = p0;
  2246. + uint32_t old_q0 = q0;
  2247. +
  2248. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  2249. + pix[-stride] = clip_uint8( p0 + i_delta );
  2250. + pix[0] = clip_uint8( q0 - i_delta ); */
  2251. +
  2252. + asm (
  2253. + /* Check if the two upper pixels should be filtered */
  2254. + "lsr\t%[tmp], %[inv_mask], 16\n"
  2255. + "breq\t0f\n"
  2256. +
  2257. + "punpckub.h\t%[tmp], %[p1]:t\n"
  2258. + "punpckub.h\t%[tmp2], %[q1]:t\n"
  2259. +
  2260. + /* p1 - q1 */
  2261. + "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
  2262. +
  2263. + "punpckub.h\t%[tmp3], %[q0]:t\n"
  2264. + "punpckub.h\t%[tmp4], %[p0]:t\n"
  2265. +
  2266. + /* q0 - p0 */
  2267. + "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
  2268. +
  2269. + /* (q0 - p0) << 2 */
  2270. + "plsl.h\t%[tmp2], %[tmp2], 2\n"
  2271. +
  2272. + /* ((q0 - p0) << 2) + (p1 - q1) */
  2273. + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
  2274. +
  2275. + "mov\t%[tmp], 0x00040004\n"
  2276. + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
  2277. + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
  2278. +
  2279. + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
  2280. + "pasr.h\t%[tmp2], %[tmp2], 3\n"
  2281. +
  2282. + "mov\t%[tmp], 0\n"
  2283. + "psub.h\t%[tmp], %[tmp], %[tc]\n"
  2284. +
  2285. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
  2286. + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
  2287. + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
  2288. +
  2289. +
  2290. + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
  2291. + "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
  2292. +
  2293. +
  2294. + /* pix[0] = clip_uint8( q0 - i_delta ); */
  2295. + "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
  2296. +
  2297. + /* Check if the two lower pixels should be filtered */
  2298. + "lsl\t%[tmp2], %[inv_mask], 16\n"
  2299. + "breq\t1f\n"
  2300. +
  2301. + "0:\n"
  2302. + "punpckub.h\t%[p1], %[p1]:b\n"
  2303. + "punpckub.h\t%[q1], %[q1]:b\n"
  2304. +
  2305. + /* p1 - q1 */
  2306. + "psub.h\t%[p1], %[p1], %[q1]\n"
  2307. +
  2308. + "punpckub.h\t%[q0], %[q0]:b\n"
  2309. + "punpckub.h\t%[p0], %[p0]:b\n"
  2310. +
  2311. + /* q0 - p0 */
  2312. + "psub.h\t%[tmp2], %[q0], %[p0]\n"
  2313. +
  2314. + /* (q0 - p0) << 2 */
  2315. + "plsl.h\t%[tmp2], %[tmp2], 2\n"
  2316. +
  2317. + /* ((q0 - p0) << 2) + (p1 - q1) */
  2318. + "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
  2319. +
  2320. + "mov\t%[q1], 0x00040004\n"
  2321. + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
  2322. + "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
  2323. +
  2324. + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
  2325. + "pasr.h\t%[tmp2], %[tmp2], 3\n"
  2326. +
  2327. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
  2328. + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
  2329. + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
  2330. +
  2331. + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
  2332. + "padd.h\t%[p0], %[p0], %[tmp2]\n"
  2333. +
  2334. + /* pix[0] = clip_uint8( q0 - i_delta ); */
  2335. + "psub.h\t%[q0], %[q0], %[tmp2]\n"
  2336. +
  2337. + "1:\n"
  2338. + "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
  2339. + "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
  2340. +
  2341. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2342. + [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
  2343. + : [tc]"r"(tc), [inv_mask]"r"(~mask));
  2344. +
  2345. + ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
  2346. + ST32(pix, (mask & old_q0) | (q0 & ~mask));
  2347. +
  2348. + }
  2349. + pix += 1;
  2350. +}
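+
+/* A scalar sketch (illustrative only, not compiled; select_bytes is a
+ hypothetical helper, not part of this patch) of the byte-mask
+ selection used by the SIMD code above. Filtered and original pixels
+ are merged branch-free, with 0xff in every byte of 'mask' whose
+ threshold test failed:
+
+ uint32_t select_bytes(uint32_t filtered, uint32_t orig, uint32_t mask)
+ {
+ return (filtered & ~mask) | (orig & mask);
+ }
+
+ This is exactly the expression in the two ST32() stores above. */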
  2351. +
  2352. +
  2353. +
  2354. +
  2355. +#ifdef CHECK_DSP_FUNCS_AGAINST_C
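+
+/* Self-test harness: with CHECK_DSP_FUNCS_AGAINST_C defined, each
+ optimized routine below is run next to its C reference and the results
+ are compared element by element (within the small deviation allowed
+ for the idcts); any mismatch is dumped and the process exits. */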
  2356. +
  2357. +void dump_block8(uint8_t *block, int line_size, int h){
  2358. + int i, j;
  2359. +
  2360. + for ( i = 0; i < h ; i++ ){
  2361. + av_log(NULL, AV_LOG_ERROR, "\t");
  2362. + for ( j = 0; j < 8 ; j++ ){
  2363. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2364. + }
  2365. + av_log(NULL, AV_LOG_ERROR, "\n");
  2366. + }
  2367. +}
  2368. +
  2369. +void dump_block4(uint8_t *block, int line_size, int h){
  2370. + int i, j;
  2371. +
  2372. + for ( i = 0; i < h ; i++ ){
  2373. + av_log(NULL, AV_LOG_ERROR, "\t");
  2374. + for ( j = 0; j < 4 ; j++ ){
  2375. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2376. + }
  2377. + av_log(NULL, AV_LOG_ERROR, "\n");
  2378. + }
  2379. +}
  2380. +
  2381. +void dump_block(uint8_t *block, int line_size, int h, int w){
  2382. + int i, j;
  2383. +
  2384. + for ( i = 0; i < h ; i++ ){
  2385. + av_log(NULL, AV_LOG_ERROR, "\t");
  2386. + for ( j = 0; j < w ; j++ ){
  2387. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2388. + }
  2389. + av_log(NULL, AV_LOG_ERROR, "\n");
  2390. + }
  2391. +}
  2392. +
  2393. +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2394. + int h, char *name, int max_dev){
  2395. + int i,j;
  2396. + for ( i = 0; i < 8 ; i++ ){
  2397. + for ( j = 0; j < h ; j++ ){
  2398. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2399. + diff = diff < 0 ? -diff : diff;
  2400. + if ( diff > max_dev ){
2401. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
2402. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2403. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
  2404. + dump_block8(test, line_size_test, h);
  2405. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2406. + dump_block8(correct, line_size_correct, h);
  2407. + exit(1);
  2408. + }
  2409. + }
  2410. + }
  2411. +}
  2412. +
  2413. +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2414. + int h, char *name, int max_dev){
  2415. + int i,j;
  2416. + for ( i = 0; i < 4 ; i++ ){
  2417. + for ( j = 0; j < h ; j++ ){
  2418. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2419. + diff = diff < 0 ? -diff : diff;
  2420. + if ( diff > max_dev ){
2421. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
2422. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
2423. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
2424. + dump_block4(test, line_size_test, h);
  2425. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2426. + dump_block4(correct, line_size_correct, h);
  2427. + exit(1);
  2428. + }
  2429. + }
  2430. + }
  2431. +}
  2432. +
  2433. +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2434. + int h, int width, char *name, int max_dev){
  2435. + int i,j;
  2436. + for ( i = 0; i < width ; i++ ){
  2437. + for ( j = 0; j < h ; j++ ){
  2438. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2439. + diff = diff < 0 ? -diff : diff;
  2440. + if ( diff > max_dev ){
2441. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
2442. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2443. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
  2444. + dump_block(test, line_size_test, h, width);
  2445. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2446. + dump_block(correct, line_size_correct, h, width);
  2447. + exit(1);
  2448. + }
  2449. + }
  2450. + }
  2451. +}
  2452. +
  2453. +void dump_dct_block(DCTELEM *block){
  2454. + int i, j;
  2455. +
  2456. + for ( i = 0; i < 8 ; i++ ){
  2457. + av_log(NULL, AV_LOG_ERROR, "\t");
  2458. + for ( j = 0; j < 8 ; j++ ){
  2459. + av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]);
  2460. + }
  2461. + av_log(NULL, AV_LOG_ERROR, "\n");
  2462. + }
  2463. +}
  2464. +
  2465. +void test_idct_avr32(DCTELEM *block){
  2466. + DCTELEM testBlock[64];
  2467. + int i, j;
  2468. +
  2469. + /* Copy transposed block to testBlock */
  2470. + for ( i = 0; i < 8 ; i++ ){
  2471. + for ( j = 0; j < 8 ; j++ ){
  2472. + testBlock[i + 8*j] = block[j + i*8];
  2473. + }
  2474. + }
  2475. +
  2476. + idct_avr32(block);
2477. + simple_idct(testBlock);
  2478. +
  2479. + for ( i = 0; i < 64 ; i++ ){
  2480. + if ( block[i] != testBlock[i] ){
  2481. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
  2482. + dump_dct_block(block);
2483. + av_log(NULL, AV_LOG_ERROR, "But should be equal to the transpose of:\n");
  2484. + dump_dct_block(testBlock);
  2485. + exit(1);
  2486. + }
  2487. + }
  2488. +}
  2489. +
  2490. +void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
  2491. + uint8_t testBlock[64];
  2492. + DCTELEM blockCopy[64];
  2493. + int i, j;
  2494. +
  2495. + /* Copy transposed block to blockCopy */
  2496. + for ( i = 0; i < 8 ; i++ ){
  2497. + for ( j = 0; j < 8 ; j++ ){
  2498. + blockCopy[i + 8*j] = block[j + i*8];
  2499. + }
  2500. + }
  2501. +
  2502. + idct_put_avr32(dest, line_size, block);
2503. + simple_idct_put(testBlock, 8, blockCopy);
  2504. +
  2505. + check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
  2506. +}
  2507. +
  2508. +
  2509. +void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
  2510. + uint8_t testBlock[64];
  2511. + DCTELEM blockCopy[64];
  2512. + int i, j;
  2513. +
  2514. + /* Copy dest to testBlock */
  2515. + for ( i = 0; i < 8 ; i++ ){
  2516. + for ( j = 0; j < 8 ; j++ ){
  2517. + testBlock[i + 8*j] = dest[i + j*line_size];
  2518. + }
  2519. + }
  2520. +
  2521. + /* Copy transposed block to blockCopy */
  2522. + for ( i = 0; i < 8 ; i++ ){
  2523. + for ( j = 0; j < 8 ; j++ ){
  2524. + blockCopy[i + 8*j] = block[j + i*8];
  2525. + }
  2526. + }
  2527. +
  2528. + idct_add_avr32(dest, line_size, block);
2529. + simple_idct_add(testBlock, 8, blockCopy);
  2530. +
  2531. + check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
  2532. +}
  2533. +
  2534. +void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
  2535. + uint8_t testBlock[16];
  2536. + DCTELEM blockCopy[16];
  2537. + int i, j;
  2538. +
  2539. + /* Copy dest to testBlock */
  2540. + for ( i = 0; i < 4 ; i++ ){
  2541. + for ( j = 0; j < 4 ; j++ ){
  2542. + testBlock[i + 4*j] = dest[i + j*stride];
  2543. + }
  2544. + }
  2545. +
2546. + /* Copy source block unchanged to blockCopy */
  2547. + for ( i = 0; i < 16 ; i++ ){
  2548. + blockCopy[i] = block[i];
  2549. + }
  2550. +
  2551. + ff_h264_idct_add_c(dest, block, stride);
  2552. +
  2553. + h264_idct_add_avr32(testBlock, blockCopy, 4);
  2554. +
2555. + check_block(testBlock, dest, 4, stride, 4, 4, "h264_idct_add", 0);
  2556. +}
  2557. +
  2558. +void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
  2559. + uint8_t testBlock[8*8];
  2560. + DCTELEM blockCopy[8*8];
  2561. + int i, j;
  2562. +
  2563. + /* Copy dest to testBlock */
  2564. + for ( i = 0; i < 8 ; i++ ){
  2565. + for ( j = 0; j < 8 ; j++ ){
  2566. + testBlock[i + 8*j] = dest[i + j*stride];
  2567. + }
  2568. + }
  2569. +
  2570. + /* Copy source block to blockCopy */
  2571. + for ( i = 0; i < 8*8 ; i++ ){
  2572. + blockCopy[i] = block[i];
  2573. + }
  2574. +
  2575. + ff_h264_idct8_add_c(dest, block, stride);
  2576. + h264_idct8_add_avr32(testBlock, blockCopy, 8);
  2577. +
2578. + check_block(testBlock, dest, 8, stride, 8, 8, "h264_idct8_add", 0);
  2579. +}
  2580. +
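+/* Compares an optimized 8-wide pixel function against its C reference.
+ in_h_size/in_v_size give the extra input columns/rows the filter reads
+ beyond the 8 x h destination block: (1,0) for horizontal, (0,1) for
+ vertical and (1,1) for the 2-D (hv) interpolation cases; see the
+ test_pixels_funcs macro below. */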
  2581. +void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
  2582. + const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
  2583. + uint8_t *testBlock, *testBlock2;
  2584. + int i, j;
  2585. + int input_v_size = h + in_v_size;
  2586. + int input_h_size = 8 + in_h_size;
  2587. +
  2588. + testBlock = alloca(input_h_size*input_v_size);
  2589. + testBlock2 = alloca(input_h_size*input_v_size);
  2590. +
  2591. + for ( i = 0; i < input_h_size ; i++ ){
  2592. + for ( j = 0; j < input_v_size ; j++ ){
  2593. + testBlock[i + input_h_size*j] = pixels[i + j*line_size];
  2594. + }
  2595. + }
  2596. +
  2597. + test(block, pixels, line_size, h);
  2598. + correct(testBlock2, testBlock, input_h_size, h);
  2599. +
  2600. + check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
  2601. +
  2602. +}
  2603. +
  2604. +void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
  2605. + uint8_t *src, int stride, int h, int w, int x, int y, char *name){
  2606. + uint8_t *testBlock, *testBlock2;
  2607. + int i, j;
  2608. + int input_v_size = h + 1;
  2609. + int input_h_size = ((w + 1) + 3) & ~3;
  2610. +
  2611. + testBlock = alloca(input_h_size*input_v_size);
  2612. + testBlock2 = alloca(input_h_size*input_v_size);
  2613. +
  2614. + for ( i = 0; i < w + 1 ; i++ ){
  2615. + for ( j = 0; j < h + 1 ; j++ ){
  2616. + testBlock[i + input_h_size*j] = src[i + j*stride];
  2617. + }
  2618. + }
  2619. +
  2620. + for ( i = 0; i < w ; i++ ){
  2621. + for ( j = 0; j < h ; j++ ){
  2622. + testBlock2[i + input_h_size*j] = dst[i + j*stride];
  2623. + }
  2624. + }
  2625. +
  2626. + test(dst, src, stride, h, x, y);
  2627. + correct(testBlock2, testBlock, input_h_size, h, x, y);
  2628. +
  2629. + check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0);
  2630. +
  2631. +}
  2632. +
  2633. +void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
  2634. + uint8_t *src, int stride, int size, char *name){
  2635. + uint8_t *testBlock, *testBlock2;
  2636. + int i, j;
  2637. + int test_stride = size + 8;
  2638. +
2639. + testBlock = (uint8_t *)alloca(test_stride*(size+8)) + 4 + test_stride*4;
  2640. + testBlock2 = alloca(test_stride*size);
  2641. +
  2642. + for ( i = -4; i < size+4 ; i++ ){
  2643. + for ( j = -4; j < size+4 ; j++ ){
  2644. + testBlock[i + test_stride*j] = src[i + j*stride];
  2645. + }
  2646. + }
  2647. +
  2648. + for ( i = 0; i < size ; i++ ){
  2649. + for ( j = 0; j < size ; j++ ){
  2650. + testBlock2[i + test_stride*j] = dst[i + j*stride];
  2651. + }
  2652. + }
  2653. +
  2654. + correct(dst, src, stride);
  2655. + test(testBlock2, testBlock, test_stride);
  2656. +
  2657. + check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
  2658. +
  2659. +}
  2660. +
  2661. +
  2662. +#define test_pixels_funcs(PFX, NUM ) \
  2663. +void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2664. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
  2665. + block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
  2666. +void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2667. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
  2668. + block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
  2669. +void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2670. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
  2671. + block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
  2672. +void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2673. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
  2674. + block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
  2675. +
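+/* For example, the first invocation below, test_pixels_funcs(put, 8),
+ expands to the four wrappers test_put_pixels8_avr32(),
+ test_put_pixels8_h_avr32(), test_put_pixels8_v_avr32() and
+ test_put_pixels8_hv_avr32(), each checking an avr32 routine against
+ the corresponding C version (put_pixels8_c, put_pixels8_x2_c, ...). */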
  2676. +test_pixels_funcs(put, 8);
  2677. +test_pixels_funcs(put_no_rnd, 8);
  2678. +test_pixels_funcs(put, 16);
  2679. +test_pixels_funcs(put_no_rnd, 16);
  2680. +
  2681. +test_pixels_funcs(avg, 8);
  2682. +test_pixels_funcs(avg_no_rnd, 8);
  2683. +test_pixels_funcs(avg, 16);
  2684. +test_pixels_funcs(avg_no_rnd, 16);
  2685. +
  2686. +#define test_h264_chroma_mc_funcs(PFX, NUM ) \
  2687. +void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
  2688. + test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
  2689. + dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
  2690. +
  2691. +test_h264_chroma_mc_funcs(put, 2);
  2692. +test_h264_chroma_mc_funcs(put, 4);
  2693. +test_h264_chroma_mc_funcs(put, 8);
  2694. +test_h264_chroma_mc_funcs(avg, 2);
  2695. +test_h264_chroma_mc_funcs(avg, 4);
  2696. +test_h264_chroma_mc_funcs(avg, 8);
  2697. +
  2698. +#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
  2699. +void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
  2700. + test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
  2701. + dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
  2702. +
  2703. +#define test_qpel_mc_funcs(PFX, NUM) \
  2704. + test_qpel_mc_funcs_type(PFX, NUM, mc00);\
  2705. + test_qpel_mc_funcs_type(PFX, NUM, mc10);\
  2706. + test_qpel_mc_funcs_type(PFX, NUM, mc20);\
  2707. + test_qpel_mc_funcs_type(PFX, NUM, mc30);\
  2708. + test_qpel_mc_funcs_type(PFX, NUM, mc01);\
  2709. + test_qpel_mc_funcs_type(PFX, NUM, mc11);\
  2710. + test_qpel_mc_funcs_type(PFX, NUM, mc21);\
  2711. + test_qpel_mc_funcs_type(PFX, NUM, mc31);\
  2712. + test_qpel_mc_funcs_type(PFX, NUM, mc02);\
  2713. + test_qpel_mc_funcs_type(PFX, NUM, mc12);\
  2714. + test_qpel_mc_funcs_type(PFX, NUM, mc22);\
  2715. + test_qpel_mc_funcs_type(PFX, NUM, mc32);\
  2716. + test_qpel_mc_funcs_type(PFX, NUM, mc03);\
  2717. + test_qpel_mc_funcs_type(PFX, NUM, mc13);\
  2718. + test_qpel_mc_funcs_type(PFX, NUM, mc23);\
  2719. + test_qpel_mc_funcs_type(PFX, NUM, mc33)
  2720. +
  2721. +test_qpel_mc_funcs(put_h264_qpel, 4);
  2722. +test_qpel_mc_funcs(put_h264_qpel, 8);
  2723. +test_qpel_mc_funcs(put_h264_qpel, 16);
  2724. +test_qpel_mc_funcs(avg_h264_qpel, 4);
  2725. +test_qpel_mc_funcs(avg_h264_qpel, 8);
  2726. +test_qpel_mc_funcs(avg_h264_qpel, 16);
  2727. +
  2728. +
  2729. +#define dspfunc(PFX, IDX, NUM) \
  2730. + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
  2731. + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
  2732. + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
  2733. + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
  2734. + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
  2735. + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
  2736. + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
  2737. + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
  2738. + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
  2739. + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
  2740. + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
  2741. + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
  2742. + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
  2743. + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
  2744. + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
  2745. + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
  2746. +
  2747. +#endif
  2748. +
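+/* Fills the DSPContext function-pointer tables so that the avr32/pico
+ versions replace the C defaults at runtime. With
+ CHECK_DSP_FUNCS_AGAINST_C defined, DSP_FUNC_NAME presumably resolves
+ to the test_ wrappers above instead of the bare routines. */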
  2749. +void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
  2750. +{
  2751. +
  2752. + /* H264 */
  2753. +
  2754. + if ( 0 /*avr32_use_pico*/ ){
  2755. + c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
  2756. + c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
  2757. + c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
  2758. +
  2759. + c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
  2760. + c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
  2761. + c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
  2762. + }
2763. +
+#undef dspfunc /* the test harness above defines a macro of the same name */
2764. +#define dspfunc(PFX, IDX, NUM) \
  2765. + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
  2766. + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
  2767. + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
  2768. + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
  2769. + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
  2770. + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
  2771. + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
  2772. + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
  2773. + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
  2774. + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
  2775. + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
  2776. + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
  2777. + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
  2778. + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
  2779. + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
  2780. + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
  2781. +
  2782. + if ( avr32_use_pico ){
  2783. + dspfunc(put_h264_qpel, 0, 16);
  2784. + dspfunc(put_h264_qpel, 1, 8);
  2785. + dspfunc(put_h264_qpel, 2, 4);
  2786. + dspfunc(avg_h264_qpel, 0, 16);
  2787. + dspfunc(avg_h264_qpel, 1, 8);
  2788. + dspfunc(avg_h264_qpel, 2, 4);
  2789. + }
  2790. +
  2791. + c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
  2792. + c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
  2793. + c->idct = DSP_FUNC_NAME(idct_avr32);
  2794. + c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
  2795. + c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
  2796. +
  2797. + /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
  2798. +
  2799. + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
  2800. +
  2801. + c->fdct = fdct_avr32;
  2802. +
  2803. + c->clear_blocks = clear_blocks_avr32;
  2804. +
  2805. +#undef dspfunc
  2806. +#define dspfunc(PFX, IDX, NUM) \
  2807. + c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
  2808. + c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
  2809. + c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
  2810. + c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
  2811. +
  2812. + dspfunc(put, 0, 16);
  2813. + dspfunc(put_no_rnd, 0, 16);
  2814. + dspfunc(put, 1, 8);
  2815. + dspfunc(put_no_rnd, 1, 8);
  2816. +
  2817. + dspfunc(avg, 1, 8);
  2818. + dspfunc(avg_no_rnd, 1, 8);
  2819. + dspfunc(avg, 0, 16);
  2820. + dspfunc(avg_no_rnd, 0, 16);
  2821. +#undef dspfunc
  2822. +
  2823. +}
  2824. +
  2825. +
  2826. +
  2827. +#if 0
  2828. +int main(int argc, char *argv[]){
  2829. +
  2830. +
  2831. +}
  2832. +#endif
  2833. +
  2834. diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S
  2835. new file mode 100644
  2836. index 0000000..be45b86
  2837. --- /dev/null
  2838. +++ b/libavcodec/avr32/fdct.S
  2839. @@ -0,0 +1,541 @@
  2840. +/*
  2841. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  2842. + *
  2843. + * Redistribution and use in source and binary forms, with or without
  2844. + * modification, are permitted provided that the following conditions
  2845. + * are met:
  2846. + *
  2847. + * 1. Redistributions of source code must retain the above copyright
  2848. + * notice, this list of conditions and the following disclaimer.
  2849. + *
  2850. + * 2. Redistributions in binary form must reproduce the above
  2851. + * copyright notice, this list of conditions and the following
  2852. + * disclaimer in the documentation and/or other materials provided
  2853. + * with the distribution.
  2854. + *
  2855. + * 3. The name of ATMEL may not be used to endorse or promote products
  2856. + * derived from this software without specific prior written
  2857. + * permission.
  2858. + *
  2859. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  2860. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  2861. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  2862. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  2863. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  2864. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  2865. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  2866. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  2867. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  2868. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  2869. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  2870. + * DAMAGE.
  2871. + */
  2872. +
  2873. +//**********************************************************
  2874. +//* 2-D fDCT, Based on: *
  2875. +//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
  2876. +//* Fast 1-D DCT Algorithms with 11 Multiplications", *
  2877. +//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal *
  2878. +//* Processing 1989 (ICASSP '89), pp. 988-991. *
  2879. +//* *
2880. +//* Fixed-point implementation optimized for the AVR32 *
2881. +//* instruction set. By keeping the coefficients in a *
2882. +//* table, two of them can be loaded per memory access, *
2883. +//* which reduces the number of loads in the loops. *
  2884. +//* *
  2885. +//* *
  2886. +//**********************************************************
  2887. +
  2888. +
  2889. +/* This routine is a slow-but-accurate integer implementation of the
  2890. + * forward DCT (Discrete Cosine Transform). Taken from the IJG software
  2891. + *
  2892. + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  2893. + * on each column. Direct algorithms are also available, but they are
  2894. + * much more complex and seem not to be any faster when reduced to code.
  2895. + *
  2896. + * This implementation is based on an algorithm described in
  2897. + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  2898. + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  2899. + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  2900. + * The primary algorithm described there uses 11 multiplies and 29 adds.
  2901. + * We use their alternate method with 12 multiplies and 32 adds.
  2902. + * The advantage of this method is that no data path contains more than one
  2903. + * multiplication; this allows a very simple and accurate implementation in
  2904. + * scaled fixed-point arithmetic, with a minimal number of shifts.
  2905. + *
2906. + * The scaling works as follows:
  2907. + *
  2908. + * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
  2909. + * larger than the true DCT outputs. The final outputs are therefore
  2910. + * a factor of N larger than desired; since N=8 this can be cured by
  2911. + * a simple right shift at the end of the algorithm. The advantage of
  2912. + * this arrangement is that we save two multiplications per 1-D DCT,
  2913. + * because the y0 and y4 outputs need not be divided by sqrt(N).
  2914. + * In the IJG code, this factor of 8 is removed by the quantization step
2915. + * (in jcdctmgr.c); here it is removed by the extra right shift of 3 in the final descaling.
  2916. + *
  2917. + * We have to do addition and subtraction of the integer inputs, which
  2918. + * is no problem, and multiplication by fractional constants, which is
  2919. + * a problem to do in integer arithmetic. We multiply all the constants
  2920. + * by CONST_SCALE and convert them to integer constants (thus retaining
  2921. + * CONST_BITS bits of precision in the constants). After doing a
  2922. + * multiplication we have to divide the product by CONST_SCALE, with proper
  2923. + * rounding, to produce the correct output. This division can be done
  2924. + * cheaply as a right shift of CONST_BITS bits. We postpone shifting
  2925. + * as long as possible so that partial sums can be added together with
  2926. + * full fractional precision.
  2927. + *
  2928. + * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  2929. + * they are represented to better-than-integral precision. These outputs
  2930. + * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
  2931. + * with the recommended scaling. (For 12-bit sample data, the intermediate
  2932. + * array is INT32 anyway.)
  2933. + *
  2934. + * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  2935. + * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
  2936. + * shows that the values given below are the most effective.
  2937. + *
  2938. + * We can gain a little more speed, with a further compromise in accuracy,
  2939. + * by omitting the addition in a descaling shift. This yields an incorrectly
  2940. + * rounded result half the time...
  2941. + */
  2942. +
  2943. + .global fdct_avr32
  2944. +
  2945. +
  2946. +
  2947. +#define CONST_BITS 13
  2948. +#define PASS1_BITS 2
  2949. +
  2950. +#define FIX_0_298631336 2446 /* FIX(0.298631336) */
  2951. +#define FIX_0_390180644 3196 /* FIX(0.390180644) */
  2952. +#define FIX_0_541196100 4433 /* FIX(0.541196100) */
  2953. +#define FIX_0_765366865 6270 /* FIX(0.765366865) */
  2954. +#define FIX_0_899976223 7373 /* FIX(0.899976223) */
  2955. +#define FIX_1_175875602 9633 /* FIX(1.175875602) */
  2956. +#define FIX_1_501321110 12299 /* FIX(1.501321110) */
  2957. +#define FIX_1_847759065 15137 /* FIX(1.847759065) */
  2958. +#define FIX_1_961570560 16069 /* FIX(1.961570560) */
  2959. +#define FIX_2_053119869 16819 /* FIX(2.053119869) */
  2960. +#define FIX_2_562915447 20995 /* FIX(2.562915447) */
  2961. +#define FIX_3_072711026 25172 /* FIX(3.072711026) */
  2962. +
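+/* Each constant above is the IJG fixed-point form round(x * 2**CONST_BITS).
+ A sketch of the usual IJG macros these values come from (shown for
+ reference, not used in this file):
+
+ #define CONST_SCALE (1 << CONST_BITS)
+ #define FIX(x) ((int) ((x) * CONST_SCALE + 0.5))
+ #define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
+
+ e.g. FIX(0.541196100) = (int) (0.541196100 * 8192 + 0.5) = 4433,
+ which matches FIX_0_541196100. */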
  2963. +
  2964. +/*
  2965. + * Perform an integer forward DCT on one block of samples.
  2966. + */
  2967. +
  2968. +//void
  2969. +//fdct_int32(short *const block)
  2970. +//{
  2971. +// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2972. +// int tmp10, tmp11, tmp12, tmp13;
  2973. +// int z1, z2, z3, z4, z5;
  2974. +// short *blkptr;
  2975. +// int *dataptr;
  2976. +// int data[64];
  2977. +// int i;
  2978. +//
  2979. +// /* Pass 1: process rows. */
  2980. +// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2981. +// /* furthermore, we scale the results by 2**PASS1_BITS. */
  2982. +//
  2983. +// dataptr = data;
  2984. +// blkptr = block;
  2985. +
  2986. + .text
  2987. +fdct_avr32:
  2988. + pushm r0-r3, r4-r7, lr
  2989. +#define loop_ctr r0
  2990. +#define blkptr r12
  2991. +#define x0 r1
  2992. +#define x1 r2
  2993. +#define x2 r3
  2994. +#define x3 r4
  2995. +#define x4 r5
  2996. +#define x5 r6
  2997. +#define x6 r7
  2998. +#define x7 r8
  2999. +#define tmp0 r5
  3000. +#define tmp7 r2
  3001. +#define tmp1 r3
  3002. +#define tmp6 r4
  3003. +#define tmp2 r9
  3004. +#define tmp5 r8
  3005. +#define tmp3 r7
  3006. +#define tmp4 r6
  3007. +
  3008. +
  3009. + mov loop_ctr, 8
  3010. +// for (i = 0; i < 8; i++) {
  3011. +ROW_LOOP:
  3012. +
  3013. + ldm blkptr, r1, r2, r3, r4
  3014. +
  3015. +// tmp2 = blkptr[2] + blkptr[5];
  3016. +// tmp3 = blkptr[3] + blkptr[4];
  3017. + paddx.h r5, r3, r2
  3018. +// tmp5 = blkptr[2] - blkptr[5];
  3019. +// tmp4 = blkptr[3] - blkptr[4];
  3020. + psubx.h r6, r3, r2
  3021. +// tmp0 = blkptr[0] + blkptr[7];
  3022. +// tmp1 = blkptr[1] + blkptr[6];
  3023. + paddx.h r2, r4, r1
  3024. +// tmp7 = blkptr[0] - blkptr[7];
  3025. +// tmp6 = blkptr[1] - blkptr[6];
  3026. + psubx.h r3, r4, r1
  3027. +
  3028. +// /* Even part per LL&M figure 1 --- note that published figure is faulty;
  3029. +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  3030. +// */
  3031. +
  3032. +#define tmp10 r1
  3033. +#define tmp13 r5
  3034. +#define tmp11 r7
  3035. +#define tmp12 r3
  3036. +#define z1 r9
  3037. +
  3038. +// tmp10 = tmp0 + tmp3;
  3039. +// tmp13 = tmp0 - tmp3;
  3040. + paddsub.h r1, r2:t, r5:b
  3041. +// tmp11 = tmp1 + tmp2;
  3042. +// tmp12 = tmp1 - tmp2;
  3043. + paddsub.h r4, r2:b, r5:t
  3044. +
  3045. +
  3046. +// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
  3047. +// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
  3048. + paddsub.h r7, r1:t, r4:t
  3049. + ld.w r10, pc[const_table - .]
  3050. + plsl.h r7, r7, PASS1_BITS
  3051. +
  3052. +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
  3053. + addhh.w r8, r4:b, r1:b
  3054. + mulhh.w r8, r8:b, r10:t
  3055. +
  3056. +// dataptr[2] =
  3057. +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
  3058. +// dataptr[6] =
  3059. +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
  3060. + mulhh.w r9, r1:b, r10:b
  3061. + ld.w r10, pc[const_table - . + 4]
  3062. + add r1, r8, r9
  3063. + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
  3064. +
  3065. + mulhh.w r9, r4:b, r10:t
  3066. + add r4, r8, r9
  3067. + satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
  3068. +
  3069. +
  3070. +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  3071. +// * cK represents cos(K*pi/16).
  3072. +// * i0..i3 in the paper are tmp4..tmp7 here.
  3073. +// */
  3074. +
  3075. +#define z2 r5
  3076. +#define z3 r6
  3077. +#define z4 r7
  3078. +#define z5 r8
  3079. +
  3080. +// z4 = tmp5 + tmp7;
  3081. +// z3 = tmp4 + tmp6;
  3082. + padd.h r2, r6, r3
  3083. +// z2 = tmp5 + tmp6;
  3084. +// z1 = tmp4 + tmp7;
  3085. + paddx.h r5, r6, r3
  3086. +
  3087. + lddpc r9, pc[const_table - . + 8]
  3088. +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
  3089. + addhh.w r8, r2:t, r2:b
  3090. + mulhh.w r8, r8:b, r10:b
  3091. + lddpc r10, pc[const_table - . + 12]
  3092. +
  3093. +
  3094. +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
  3095. + mulhh.w r11, r6:b, r9:t
  3096. +
  3097. +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
  3098. + mulhh.w r6, r6:t, r9:b
  3099. +
  3100. +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
  3101. + lddpc r9, pc[const_table - . + 20]
  3102. + mulhh.w lr, r3:b, r10:t
  3103. +
  3104. +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
  3105. + mulhh.w r3, r3:t, r10:b
  3106. +
  3107. +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
  3108. + mulhh.w r10, r2:b, r9:t
  3109. +
  3110. +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
  3111. + mulhh.w r2, r2:t, r9:b
  3112. + lddpc r9, pc[const_table - . + 16]
  3113. +// z3 += z5;
  3114. +// z4 += z5;
  3115. + add r10, r8
  3116. + add r2, r8
  3117. +
  3118. +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
  3119. + mulhh.w r8, r5:b, r9:t
  3120. +
  3121. +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
  3122. + mulhh.w r5, r5:t, r9:b
  3123. +
  3124. +// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
  3125. + add r11, r8
  3126. + add r11, r10
  3127. + satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
  3128. +
  3129. +// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
  3130. + add r6, r5
  3131. +
  3132. + sthh.w blkptr[6*2], r4:b, r11:b
  3133. + add r6, r2
  3134. + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
  3135. +
  3136. +// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
  3137. + add lr, r5
  3138. + sthh.w blkptr[4*2], r7:b, r6:b
  3139. + add lr, r10
  3140. + satrnds lr >> (CONST_BITS - PASS1_BITS), 31
  3141. +
  3142. +// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
  3143. + add r3, r8
  3144. + sthh.w blkptr[2*2], r1:b, lr:b
  3145. + add r3, r2
  3146. + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
  3147. +
  3148. +
  3149. +
  3150. +// dataptr += 8; /* advance pointer to next row */
  3151. +// blkptr += 8;
  3152. + sthh.w blkptr[0], r7:t, r3:b
  3153. + sub blkptr, -16
  3154. + sub loop_ctr, 1
  3155. + brne ROW_LOOP
  3156. +
  3157. +// }
  3158. +
  3159. + /* Pass 2: process columns.
3160. + * We remove the PASS1_BITS scaling and also the overall factor of 8
3161. + * left by the two passes, via the extra shift of 3 in the descaling.
  3162. + */
  3163. +
  3164. +// dataptr = data;
  3165. + sub blkptr, 128
  3166. +
  3167. + mov loop_ctr, 4
3168. +// for (i = 0; i < 8; i += 2) { /* two columns per iteration */
3169. +COLUMN_LOOP:
  3170. + ld.w r1, blkptr[0]
  3171. + ld.w r2, blkptr[1*8*2]
  3172. + ld.w r3, blkptr[2*8*2]
  3173. + ld.w r4, blkptr[3*8*2]
  3174. + ld.w r5, blkptr[4*8*2]
  3175. + ld.w r6, blkptr[5*8*2]
  3176. + ld.w r7, blkptr[6*8*2]
  3177. + ld.w r8, blkptr[7*8*2]
  3178. +
  3179. +// tmp0 = blkptr[0] + blkptr[7*8];
  3180. + padds.sh r9, r1, r8
  3181. +// tmp7 = blkptr[0] - blkptr[7*8];
  3182. + psubs.sh r1, r1, r8
  3183. +// tmp1 = blkptr[1*8] + blkptr[6*8];
  3184. + padds.sh r8, r2, r7
  3185. +// tmp6 = blkptr[1*8] - blkptr[6*8];
  3186. + psubs.sh r2, r2, r7
  3187. +// tmp2 = blkptr[2*8] + blkptr[5*8];
  3188. + padds.sh r7, r3, r6
  3189. +// tmp5 = blkptr[2*8] - blkptr[5*8];
  3190. + psubs.sh r3, r3, r6
  3191. +// tmp3 = blkptr[3*8] + blkptr[4*8];
  3192. + padds.sh r6, r4, r5
  3193. +// tmp4 = blkptr[3*8] - blkptr[4*8];
  3194. + psubs.sh r4, r4, r5
  3195. +
  3196. +// /* even part per ll&m figure 1 --- note that published figure is faulty;
  3197. +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  3198. +// */
  3199. +//
  3200. +// tmp10 = tmp0 + tmp3;
  3201. + padds.sh r5, r9, r6
  3202. +// tmp13 = tmp0 - tmp3;
  3203. + psubs.sh r9, r9, r6
  3204. +// tmp11 = tmp1 + tmp2;
  3205. + padds.sh r6, r8, r7
  3206. +// tmp12 = tmp1 - tmp2;
  3207. + psubs.sh r8, r8, r7
  3208. +
  3209. +// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
  3210. +// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
3211. +// Possible overflow here, hence the saturating padds.sh/psubs.sh ops
  3212. + padds.sh r7, r5, r6
  3213. + psubs.sh r5, r5, r6
  3214. +
  3215. + //Rounding
  3216. + mov lr, (1 << (PASS1_BITS + 2))
  3217. + orh lr, hi(1 << (16 + PASS1_BITS + 2))
  3218. + padds.sh r7, r7, lr
  3219. + padds.sh r5, r5, lr
  3220. +
  3221. + pasr.h r7, r7, PASS1_BITS + 3
  3222. + pasr.h r5, r5, PASS1_BITS + 3
  3223. + st.w r12[0], r7
  3224. + st.w r12[4*8*2], r5
  3225. +
  3226. + lddpc r10, const_table2
  3227. +
  3228. +
  3229. +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
  3230. + padds.sh r5, r8, r9
  3231. + mulhh.w r6, r5:t, r10:t
  3232. + mulhh.w r7, r5:b, r10:t
  3233. +
  3234. +// dataptr[16] =
  3235. +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
  3236. + lddpc r11, const_table2 + 4
  3237. + mulhh.w lr, r9:t, r10:b
  3238. + mulhh.w r9, r9:b, r10:b
  3239. + add lr, r6
  3240. + add r9, r7
  3241. + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
  3242. + satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
  3243. + sthh.w r12[2*8*2], lr:b, r9:b
  3244. +
  3245. +// dataptr[48] =
  3246. +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
  3247. + mulhh.w lr, r8:t, r11:t
  3248. + mulhh.w r8, r8:b, r11:t
  3249. + add lr, r6
  3250. + add r8, r7
  3251. + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
  3252. + satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
  3253. + sthh.w r12[6*8*2], lr:b, r8:b
  3254. +
  3255. +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  3256. +// * cK represents cos(K*pi/16).
  3257. +// * i0..i3 in the paper are tmp4..tmp7 here.
  3258. +// */
  3259. +//
  3260. +// z2 = tmp5 + tmp6;
  3261. +// z3 = tmp4 + tmp6;
  3262. +// z4 = tmp5 + tmp7;
  3263. + padds.sh r5, r3, r2
  3264. + padds.sh r6, r4, r2
  3265. + padds.sh r7, r3, r1
  3266. +
  3267. +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
  3268. + padds.sh r8, r6, r7
  3269. + mulhh.w r9, r8:t, r11:b
  3270. + mulhh.w r8, r8:b, r11:b
  3271. +
  3272. +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
  3273. +// z3 += z5;
  3274. + lddpc r11, const_table2 + 8
  3275. + mulhh.w r10, r6:t, r11:t
  3276. + mulhh.w r6, r6:b, r11:t
  3277. + add r10, r9
  3278. + add r6, r8
  3279. +
  3280. +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
  3281. +// z4 += z5;
  3282. + mulhh.w lr, r7:t, r11:b
  3283. + mulhh.w r7, r7:b, r11:b
  3284. + lddpc r11, const_table2 + 12
  3285. + st.w --sp,r0
  3286. + add lr, r9
  3287. + add r7, r8
  3288. +
  3289. +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
  3290. + mulhh.w r0, r2:t, r11:t
  3291. + machh.w r0, r5:t, r11:b
  3292. + mulhh.w r2, r2:b, r11:t
  3293. + machh.w r2, r5:b, r11:b
  3294. +
  3295. +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
  3296. +// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
  3297. + add r0, r10
  3298. + lddpc r11, const_table2 + 16
  3299. + add r2, r6
  3300. + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
  3301. + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
  3302. + sthh.w r12[3*8*2], r0:b, r2:b
  3303. +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
  3304. + mulhh.w r0, r3:t, r11:t
  3305. + machh.w r0, r5:t, r11:b
  3306. + mulhh.w r2, r3:b, r11:t
  3307. + machh.w r2, r5:b, r11:b
  3308. + add r0, lr
  3309. + lddpc r11, const_table2 + 20
  3310. + add r2, r7
  3311. +
  3312. +// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
  3313. + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
  3314. + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
  3315. + sthh.w r12[5*8*2], r0:b, r2:b
  3316. +
  3317. +
  3318. +// z1 = tmp4 + tmp7;
  3319. + padds.sh r2, r4, r1
  3320. +
  3321. +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
  3322. + mulhh.w r3, r4:t, r11:t
  3323. + machh.w r3, r2:t, r11:b
  3324. + mulhh.w r4, r4:b, r11:t
  3325. + machh.w r4, r2:b, r11:b
  3326. + add r3, r10
  3327. + lddpc r11, const_table2 + 24
  3328. + add r4, r6
  3329. +
  3330. +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
  3331. +// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
  3332. + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
  3333. + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
  3334. + sthh.w r12[7*8*2], r3:b, r4:b
  3335. +
  3336. +
  3337. +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
  3338. + mulhh.w r3, r1:t, r11:t
  3339. + machh.w r3, r2:t, r11:b
  3340. + mulhh.w r4, r1:b, r11:t
  3341. + machh.w r4, r2:b, r11:b
  3342. + add r3, lr
  3343. + add r4, r7
  3344. +
  3345. +// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
  3346. + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
  3347. + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
  3348. + sthh.w r12[1*8*2], r3:b, r4:b
  3349. + ld.w r0, sp++
  3350. +
  3351. +// dataptr++; /* advance pointer to next column */
  3352. + sub blkptr, -4
  3353. + sub loop_ctr, 1
3354. + brne COLUMN_LOOP
  3355. +
  3356. +// }
  3357. +
  3358. + popm r0-r3, r4-r7, pc
  3359. +
  3360. +// /* descale */
  3361. +// for (i = 0; i < 64; i++)
  3362. +// block[i] = (short int) DESCALE(data[i], 3);
  3363. +
  3364. +
  3365. +//}
  3366. +
  3367. +
  3368. + .align 2
  3369. +const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
  3370. + .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
  3371. + .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
  3372. +
  3373. +const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
  3374. + .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
  3375. + .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
  3376. + .short FIX_1_501321110, -FIX_0_899976223
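+
+/* The tables pack two 16-bit coefficients per 32-bit word in exactly the
+ order the passes consume them, so a single lddpc feeds both the :t and
+ :b halfword multiplies of each rotation. */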
  3377. +
  3378. +
  3379. +
  3380. +
  3381. diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S
  3382. new file mode 100644
  3383. index 0000000..4b23e2d
  3384. --- /dev/null
  3385. +++ b/libavcodec/avr32/h264idct.S
  3386. @@ -0,0 +1,451 @@
  3387. +/*
  3388. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  3389. + *
  3390. + * Redistribution and use in source and binary forms, with or without
  3391. + * modification, are permitted provided that the following conditions
  3392. + * are met:
  3393. + *
  3394. + * 1. Redistributions of source code must retain the above copyright
  3395. + * notice, this list of conditions and the following disclaimer.
  3396. + *
  3397. + * 2. Redistributions in binary form must reproduce the above
  3398. + * copyright notice, this list of conditions and the following
  3399. + * disclaimer in the documentation and/or other materials provided
  3400. + * with the distribution.
  3401. + *
  3402. + * 3. The name of ATMEL may not be used to endorse or promote products
  3403. + * derived from this software without specific prior written
  3404. + * permission.
  3405. + *
  3406. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  3407. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  3408. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  3409. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  3410. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  3411. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  3412. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  3413. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  3414. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  3415. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  3416. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  3417. + * DAMAGE.
  3418. + */
  3419. +
  3420. + .global h264_idct_add_avr32
  3421. +
  3422. + /* Macro for performing the 1-D transform on one row line.
  3423. +
  3424. + The register 'w01' should contain the first two pixels,
  3425. + and the register 'w23' should contain the last two pixels
  3426. + in the line. The resulting line is placed in p01 and p23
  3427. + so that { w01, w23 } = { x0, x1, x3, x2 }.
  3428. + 'tmp' and 'tmp2' should be scratchpad registers. */
  3429. + .macro transform_row w01, w23, tmp, tmp2
  3430. + add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
  3431. + sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
  3432. + bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
  3433. + pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
  3434. + paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
  3435. + padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
  3436. + psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
  3437. + .endm
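+
+ /* In scalar C the same 1-D transform is (cf. ff_h264_idct_add_c):
+
+ z0 = w0 + w2; z1 = w0 - w2;
+ z2 = (w1 >> 1) - w3; z3 = w1 + (w3 >> 1);
+ x0 = z0 + z3; x1 = z1 + z2;
+ x2 = z1 - z2; x3 = z0 - z3;
+
+ the halfword instructions above evaluate this for two samples
+ at once. */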
  3438. +
  3439. + /* Macro for performing the 1-D transform on two columns.
  3440. +
  3441. + The registers w0, w1, w2, w3 should each contain two
3442. + packed samples from the two columns to transform.
  3443. + tmp and tmp2 are scratchpad registers.
  3444. +
  3445. + The resulting transformed columns are placed in the
  3446. + same positions as the input columns.
  3447. + */
  3448. + .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
  3449. + padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
  3450. + psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
  3451. + pasr.h \w2, \w1, 1 /* w2 = w1/2 */
  3452. + pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
  3453. + psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
  3454. + padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
  3455. + padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
  3456. + psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
  3457. + padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
  3458. + psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
  3459. + /* Scale down result. */
  3460. + pasr.h \w0, \w0, 6
  3461. + pasr.h \w1, \w1, 6
  3462. + pasr.h \w2, \w2, 6
  3463. + pasr.h \w3, \w3, 6
  3464. + .endm
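+
+ /* Note that the >>6 descale normally applied after both passes is
+ folded into the end of this macro, while the rounding term (32)
+ is added to the DC coefficient once, before the row pass. */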
  3465. +
  3466. +/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
  3467. +
  3468. +h264_idct_add_avr32:
  3469. +
  3470. + stm --sp,r0-r3,r4-r7, lr
  3471. +
  3472. + /* Setup rounding factor. */
  3473. + mov r0, (1 << 5)
  3474. + lsl r0, 16
  3475. +
  3476. + /* Load block */
  3477. + ldm r11,r2-r9
  3478. + /* r9 = { w00, w01 },
  3479. + r8 = { w02, w03 },
  3480. + r7 = { w10, w11 },
  3481. + r6 = { w12, w13 },
  3482. + r5 = { w20, w21 },
  3483. + r4 = { w22, w23 },
  3484. + r3 = { w30, w31 },
  3485. + r2 = { w32, w33 } */
  3486. +
  3487. +
  3488. + /* Add the rounding factor to w00. */
  3489. + add r9, r0
  3490. +
  3491. + /* Transform rows */
  3492. + transform_row r9, r8, r0, r1
  3493. + transform_row r7, r6, r0, r1
  3494. + transform_row r5, r4, r0, r1
  3495. + transform_row r3, r2, r0, r1
  3496. +
  3497. + /* Transform columns */
  3498. + transform_2columns r9, r7, r5, r3, r0, r1
  3499. + transform_2columns r8, r6, r4, r2, r0, r1
  3500. +
  3501. + /* Load predicted pixels.*/
  3502. + ld.w lr, r12[0]
  3503. + ld.w r11, r12[r10]
  3504. +
3505. + /* Unpack to halfwords. */
  3506. + punpckub.h r0, lr:t
  3507. + punpckub.h r1, lr:b
  3508. +
  3509. + /* Add with transformed row. */
  3510. + padd.h r0, r0, r9
  3511. + paddx.h r1, r1, r8
  3512. + /* Pack and saturate back to 8-bit pixels. */
  3513. + packsh.ub r0, r0, r1
  3514. +
3515. + /* Unpack to halfwords. */
  3516. + punpckub.h lr, r11:t
  3517. + punpckub.h r11, r11:b
  3518. +
  3519. + /* Add with transformed row. */
  3520. + padd.h lr, lr, r7
  3521. + paddx.h r11, r11, r6
  3522. + /* Pack and saturate back to 8-bit pixels. */
  3523. + packsh.ub r1, lr, r11
  3524. +
  3525. + /* Store back to frame. */
  3526. + st.w r12[0], r0
  3527. + st.w r12[r10], r1
  3528. +
  3529. + add r12, r12, r10 << 1
  3530. +
  3531. + /* Load predicted pixels.*/
  3532. + ld.w lr, r12[0]
  3533. + ld.w r11, r12[r10]
  3534. +
3535. + /* Unpack to halfwords. */
  3536. + punpckub.h r0, lr:t
  3537. + punpckub.h r1, lr:b
  3538. +
  3539. + /* Add with transformed row. */
  3540. + padd.h r0, r0, r5
  3541. + paddx.h r1, r1, r4
  3542. + /* Pack and saturate back to 8-bit pixels. */
  3543. + packsh.ub r0, r0, r1
  3544. +
3545. + /* Unpack to halfwords. */
  3546. + punpckub.h lr, r11:t
  3547. + punpckub.h r11, r11:b
  3548. +
  3549. + /* Add with transformed row. */
  3550. + padd.h lr, lr, r3
  3551. + paddx.h r11, r11, r2
  3552. + /* Pack and saturate back to 8-bit pixels. */
  3553. + packsh.ub r1, lr, r11
  3554. +
  3555. + /* Store back to frame. */
  3556. + st.w r12[0], r0
  3557. + st.w r12[r10], r1
  3558. +
  3559. + ldm sp++,r0-r3,r4-r7, pc
  3560. +
  3561. +
  3562. + .global h264_idct8_add_avr32
  3563. +//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
  3564. +
  3565. +h264_idct8_add_avr32:
  3566. + stm --sp,r0-r3,r4-r7, lr
  3567. +
  3568. + /* Push dst and stride on stack */
  3569. + stm --sp,r10,r12
  3570. +
  3571. +// int i;
  3572. +// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
  3573. +// uint8_t *cm = cropTbl + MAX_NEG_CROP;
  3574. +
  3575. +// block[0] += 32;
  3576. +
  3577. +
3578. +// for( i = 0; i < 8; i += 2 ) /* two columns per iteration */
  3579. +// {
  3580. + mov lr, 4
  3581. +0:
  3582. + ld.w r7, r11[0*(8*2)]
  3583. + ld.w r6, r11[1*(8*2)]
  3584. + ld.w r5, r11[2*(8*2)]
  3585. + ld.w r4, r11[3*(8*2)]
  3586. + ld.w r3, r11[4*(8*2)]
  3587. + ld.w r2, r11[5*(8*2)]
  3588. + ld.w r1, r11[6*(8*2)]
  3589. + ld.w r0, r11[7*(8*2)]
  3590. +
  3591. +/*
  3592. +
  3593. + const int a0 = src[0][i] + src[4][i];
  3594. + const int a2 = src[0][i] - src[4][i];
  3595. + const int a4 = (src[2][i]>>1) - src[6][i];
  3596. + const int a6 = (src[6][i]>>1) + src[2][i];
  3597. +*/
  3598. + padd.h r8, r7, r3 /* r8 = a0 */
  3599. + psub.h r7, r7, r3 /* r7 = a2 */
  3600. + pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
  3601. + pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
  3602. + psub.h r3, r3, r1 /* r3 = a4 */
  3603. + padd.h r9, r9, r5 /* r9 = a6 */
  3604. +
  3605. +/*
  3606. + const int b0 = a0 + a6;
  3607. + const int b2 = a2 + a4;
  3608. + const int b4 = a2 - a4;
  3609. + const int b6 = a0 - a6;
  3610. +*/
  3611. + padd.h r1, r8, r9 /* r1 = b0 */
  3612. + psub.h r8, r8, r9 /* r8 = b6 */
  3613. + padd.h r5, r7, r3 /* r5 = b2 */
  3614. + psub.h r7, r7, r3 /* r7 = b4 */
  3615. +
  3616. +/*
  3617. + const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
  3618. + const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
  3619. + const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
  3620. + const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
  3621. +*/
  3622. + pasr.h r3, r0, 1
  3623. + padd.h r3, r3, r0
  3624. + psub.h r3, r2, r3
  3625. + psub.h r3, r3, r4 /* r3 = a1 */
  3626. +
  3627. + pasr.h r9, r4, 1
  3628. + padd.h r9, r9, r4
  3629. + psub.h r9, r0, r9
  3630. + padd.h r9, r6, r9 /* r9 = a3 */
  3631. +
  3632. + pasr.h r10, r2, 1
  3633. + padd.h r10, r10, r2
  3634. + padd.h r10, r10, r0
  3635. + psub.h r10, r10, r6 /* r10 = a5 */
  3636. +
  3637. + pasr.h r0, r6, 1
  3638. + padd.h r0, r0, r6
  3639. + padd.h r0, r0, r2
  3640. + padd.h r0, r0, r4 /* r0 = a7 */
  3641. +/*
  3642. + const int b1 = (a7>>2) + a1;
  3643. + const int b3 = a3 + (a5>>2);
  3644. + const int b5 = (a3>>2) - a5;
  3645. + const int b7 = a7 - (a1>>2);
  3646. +*/
  3647. + pasr.h r2, r0, 2
  3648. + padd.h r2, r2, r3 /* r2 = b1 */
  3649. + pasr.h r3, r3, 2
  3650. + psub.h r3, r0, r3 /* r3 = b7 */
  3651. +
  3652. + pasr.h r0, r10, 2
  3653. + padd.h r0, r0, r9 /* r0 = b3 */
  3654. + pasr.h r9, r9, 2
  3655. + psub.h r9, r9, r10 /* r9 = b5 */
  3656. +
  3657. +
  3658. +/*
  3659. + src[0][i] = b0 + b7;
  3660. + src[7][i] = b0 - b7;
  3661. + src[1][i] = b2 + b5;
  3662. + src[6][i] = b2 - b5;
  3663. + src[2][i] = b4 + b3;
  3664. + src[5][i] = b4 - b3;
  3665. + src[3][i] = b6 + b1;
  3666. + src[4][i] = b6 - b1; */
  3667. +
  3668. + padd.h r4, r1, r3
  3669. + psub.h r1, r1, r3
  3670. + st.w r11[0*(8*2)], r4
  3671. + st.w r11[7*(8*2)], r1
  3672. +
  3673. + padd.h r3, r5, r9
  3674. + psub.h r5, r5, r9
  3675. + st.w r11[1*(8*2)], r3
  3676. + st.w r11[6*(8*2)], r5
  3677. +
  3678. + padd.h r9, r7, r0
  3679. + psub.h r7, r7, r0
  3680. + st.w r11[2*(8*2)], r9
  3681. + st.w r11[5*(8*2)], r7
  3682. +
  3683. + padd.h r0, r8, r2
  3684. + psub.h r8, r8, r2
  3685. + st.w r11[3*(8*2)], r0
  3686. + st.w r11[4*(8*2)], r8
  3687. +
  3688. + sub r11, -4
  3689. + sub lr, 1
  3690. + brne 0b
  3691. +
  3692. +// }
  3693. +
  3694. + lddsp r12, sp[0] /* r12 = dst */
  3695. + sub r11, 4*4
  3696. + ldm r11++, r4-r7
  3697. + mov lr, 8
  3698. + /* Push dst and stride on stack */
  3699. +
  3700. +1:
  3701. +// for( i = 0; i < 8; i++ )
  3702. +// {
  3703. +
  3704. + /* r7 = {src[i][0], src[i][1]}
  3705. + r6 = {src[i][2], src[i][3]}
  3706. + r5 = {src[i][4], src[i][5]}
  3707. + r4 = {src[i][6], src[i][7]} */
  3708. +
  3709. +/*
  3710. + const int a0 = src[i][0] + src[i][4];
  3711. + const int a2 = src[i][0] - src[i][4];
  3712. + const int a4 = (src[i][2]>>1) - src[i][6];
  3713. + const int a6 = (src[i][6]>>1) + src[i][2];
  3714. +*/
  3715. + pasr.h r8, r6, 1
  3716. + pasr.h r9, r4, 1
  3717. + addhh.w r0, r7:t, r5:t /* r0 = a0 */
  3718. + subhh.w r1, r7:t, r5:t /* r1 = a2 */
  3719. + subhh.w r2, r8:t, r4:t /* r2 = a4 */
  3720. + addhh.w r3, r9:t, r6:t /* r3 = a6 */
  3721. +
  3722. +/*
  3723. + const int b0 = a0 + a6;
  3724. + const int b2 = a2 + a4;
  3725. + const int b4 = a2 - a4;
  3726. + const int b6 = a0 - a6;
  3727. +*/
  3728. + add r10, r0, r3 /* r10 = b0 */
  3729. + sub r0, r3 /* r0 = b6 */
  3730. + add r3, r1, r2 /* r3 = b2 */
  3731. + sub r1, r2 /* r1 = b4 */
  3732. +/*
  3733. +
  3734. +
  3735. + const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
  3736. + const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
  3737. + const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
  3738. + const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
  3739. + addhh.w r8, r8:b, r6:b
  3740. + addhh.w r2, r4:b, r7:b
  3741. + sub r2, r8 /* r2 = a3 */
  3742. +
  3743. + addhh.w r9, r9:b, r4:b
  3744. + subhh.w r8, r5:b, r6:b
  3745. + sub r8, r9 /* r8 = a1 */
  3746. +
  3747. + pasr.h r9, r7, 1
  3748. + addhh.w r9, r9:b, r7:b
  3749. + addhh.w r6, r5:b, r6:b
  3750. + add r6, r9 /* r6 = a7 */
  3751. +
  3752. + pasr.h r9, r5, 1
  3753. + addhh.w r9, r9:b, r5:b
  3754. + subhh.w r5, r4:b, r7:b
  3755. + add r5, r9 /* r5 = a5 */
  3756. +
  3757. +/* const int b1 = (a7>>2) + a1;
  3758. + const int b3 = (a5>>2) + a3;
  3759. + const int b5 = (a3>>2) - a5;
  3760. + const int b7 = -(a1>>2) + a7 ; */
  3761. + asr r4, r6, 2
  3762. + add r4, r8 /* r4 = b1 */
  3763. + asr r8, 2
  3764. + rsub r8, r6 /* r8 = b7 */
  3765. +
  3766. + asr r6, r5, 2
  3767. + add r6, r2 /* r6 = b3 */
  3768. + asr r2, 2
  3769. + sub r2, r5 /* r2 = b5 */
  3770. +
  3771. +/*
  3772. + dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
  3773. + dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
  3774. + dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
  3775. + dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
  3776. + dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
  3777. + dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
  3778. + dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
  3779. + dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
  3780. +*/
  3781. + add r5, r10, r8
  3782. + satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
  3783. + sub r10, r8
  3784. + satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
  3785. + add r8, r3, r2
  3786. + satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
  3787. + sub r3, r2
  3788. + satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
  3789. +
  3790. + add r2, r1, r6
  3791. + satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
  3792. + sub r1, r6
  3793. + satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
  3794. +
  3795. + add r6, r0, r4
  3796. + satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
  3797. + sub r0, r4
  3798. + satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
  3799. +
  3800. + ld.w r4, r12[0]
  3801. +
  3802. + packw.sh r8, r5, r8
  3803. + packw.sh r7, r2, r6
  3804. + ld.w r9, r12[4]
  3805. + packw.sh r6, r0, r1
  3806. + packw.sh r5, r3, r10
  3807. +
  3808. + punpckub.h r10, r4:t
  3809. + punpckub.h r4, r4:b
  3810. + punpckub.h r3, r9:t
  3811. + punpckub.h r9, r9:b
  3812. +
  3813. + padd.h r8, r8, r10
  3814. + padd.h r7, r7, r4
  3815. + padd.h r6, r6, r3
  3816. + padd.h r5, r5, r9
  3817. +
  3818. + lddsp r10, sp[4] /* r10 = stride */
  3819. + packsh.ub r0, r8, r7
  3820. + packsh.ub r1, r6, r5
  3821. +
  3822. + st.w r12[0], r0
  3823. + st.w r12[4], r1
  3824. +
  3825. + ldm r11++, r4-r7
  3826. + add r12, r10 /* dst += stride */
  3827. +
  3828. + sub lr, 1
  3829. + brne 1b
  3830. +
  3831. + sub sp, -8
  3832. + ldm sp++,r0-r3,r4-r7, pc
  3833. +
  3834. +
  3835. +
  3836. +// }
  3837. +//}
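/* Editor's note (not part of the patch): the routine above is the 8x8
   H.264 IDCT-and-add. Below is a minimal C sketch of the second
   (horizontal) pass plus the final add/clamp, reconstructed directly
   from the interleaved C comments; the first pass applies the same
   butterfly column-wise. `cm` is the clipping table from the comments
   and the function name is illustrative. */

#include <stdint.h>

static void h264_idct8_row_sketch(int16_t src[8][8], uint8_t *dst,
                                  int stride, const uint8_t *cm, int i)
{
    const int a0 = src[i][0] + src[i][4];
    const int a2 = src[i][0] - src[i][4];
    const int a4 = (src[i][2] >> 1) - src[i][6];
    const int a6 = (src[i][6] >> 1) + src[i][2];

    const int b0 = a0 + a6, b2 = a2 + a4, b4 = a2 - a4, b6 = a0 - a6;

    const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1] >> 1);
    const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7] >> 1);
    const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3] >> 1);
    const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5] >> 1);

    const int b1 = (a7 >> 2) + a1, b3 = (a5 >> 2) + a3;
    const int b5 = (a3 >> 2) - a5, b7 = a7 - (a1 >> 2);

    dst[i*stride + 0] = cm[dst[i*stride + 0] + ((b0 + b7) >> 6)];
    dst[i*stride + 1] = cm[dst[i*stride + 1] + ((b2 + b5) >> 6)];
    dst[i*stride + 2] = cm[dst[i*stride + 2] + ((b4 + b3) >> 6)];
    dst[i*stride + 3] = cm[dst[i*stride + 3] + ((b6 + b1) >> 6)];
    dst[i*stride + 4] = cm[dst[i*stride + 4] + ((b6 - b1) >> 6)];
    dst[i*stride + 5] = cm[dst[i*stride + 5] + ((b4 - b3) >> 6)];
    dst[i*stride + 6] = cm[dst[i*stride + 6] + ((b2 - b5) >> 6)];
    dst[i*stride + 7] = cm[dst[i*stride + 7] + ((b0 - b7) >> 6)];
}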
  3838. diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S
  3839. new file mode 100644
  3840. index 0000000..e7551ec
  3841. --- /dev/null
  3842. +++ b/libavcodec/avr32/idct.S
  3843. @@ -0,0 +1,829 @@
  3844. +/*
  3845. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  3846. + *
  3847. + * Redistribution and use in source and binary forms, with or without
  3848. + * modification, are permitted provided that the following conditions
  3849. + * are met:
  3850. + *
  3851. + * 1. Redistributions of source code must retain the above copyright
  3852. + * notice, this list of conditions and the following disclaimer.
  3853. + *
  3854. + * 2. Redistributions in binary form must reproduce the above
  3855. + * copyright notice, this list of conditions and the following
  3856. + * disclaimer in the documentation and/or other materials provided
  3857. + * with the distribution.
  3858. + *
  3859. + * 3. The name of ATMEL may not be used to endorse or promote products
  3860. + * derived from this software without specific prior written
  3861. + * permission.
  3862. + *
  3863. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  3864. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  3865. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  3866. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  3867. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  3868. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  3869. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  3870. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  3871. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  3872. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  3873. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  3874. + * DAMAGE.
  3875. + */
  3876. +
  3877. + .global idct_add_avr32
  3878. + .global idct_put_avr32
  3879. + .global idct_avr32
  3880. +
  3881. +
  3882. +#define CONST_BITS 13
  3883. +#define PASS1_BITS 2
  3884. +
  3885. +#define ONE ((INT32) 1)
  3886. +
  3887. +#define CONST_SCALE (ONE << CONST_BITS)
  3888. +
  3889. +#define LINE_SIZE 32
  3890. +
  3891. +#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
  3892. +#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
  3893. +#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
  3894. +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
  3895. +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
  3896. +#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
  3897. +#define FIX_1_501321110 (12299)/* FIX(1.501321110) */
  3898. +#define FIX_1_847759065 (15137)/* FIX(1.847759065) */
  3899. +#define FIX_1_961570560 (16069)/* FIX(1.961570560) */
  3900. +#define FIX_2_053119869 (16819)/* FIX(2.053119869) */
  3901. +#define FIX_2_562915447 (20995)/* FIX(2.562915447) */
  3902. +#define FIX_3_072711026 (25172)/* FIX(3.072711026) */
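/* Editor's note: the constants above are the usual libjpeg-style
   fixed-point encodings, i.e. FIX(x) = round(x * 2^CONST_BITS) with
   CONST_BITS = 13. The generating macro is assumed, not part of the
   patch:

       #define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5))

   e.g. FIX(0.541196100) = round(0.5411961 * 8192) = 4433. */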
  3903. +
  3904. +
  3905. +#define loop_cnt r11
  3906. +
  3907. + .text
  3908. +
  3909. +idct_add_avr32:
  3910. + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
  3911. +
  3912. + // Give room for some variables on the stack
  3913. + sub sp, 8
  3914. + stdsp SP[0], r12 // rfp
  3915. + stdsp SP[4], r11 // iinc
  3916. +
  3917. + mov loop_cnt, 8 //Initialize loop counter
  3918. +
  3919. +FOR_ROW:
  3920. +
  3921. + ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row of the DCT block
  3922. + mov r6, 0
  3923. +#ifdef USE_PREFETCH
  3924. + pref r10[LINE_SIZE] //Prefetch next line
  3925. +#endif
  3926. + or r4, r2, r3 << 16
  3927. + or r4, r1 //Check if all DCT coefficients except the DC are zero
  3928. + or r4, r0
  3929. + brne AC_ROW //If there are non-zero AC coefficients, perform the row transform
  3930. +
  3931. + paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
  3932. + plsl.h r5, r5, PASS1_BITS
  3933. + mov r4, r5
  3934. + st.d r10++, r4
  3935. + st.d r10++, r4
  3936. +
  3937. + sub loop_cnt, 1 //Decrement loop counter
  3938. + brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
  3939. +
  3940. + bral COLOUMN_TRANSFORM //Perform the column transform after the row transform is done
  3941. +
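/* Editor's note: the DC-only shortcut taken above corresponds to this C
   sketch (when a row has no AC coefficients, the row pass collapses to
   the scaled DC value replicated across the row):

       if ((row[1] | row[2] | row[3] | row[4] |
            row[5] | row[6] | row[7]) == 0) {
           int16_t dc = row[0] << PASS1_BITS;
           for (j = 0; j < 8; j++)
               row[j] = dc;
       }
*/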
  3942. +
  3943. +AC_ROW:
  3944. +
  3945. +
  3946. + ld.w r12, pc[coef_table - .]
  3947. + ld.w r9, pc[coef_table - . + 4]
  3948. +
  3949. + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
  3950. + mulhh.w r5, r4:t, r12:t
  3951. + mulhh.w r6, r0:t, r12:b
  3952. + ld.w r12, pc[coef_table - . + 8]
  3953. + mulhh.w r7, r2:t, r9:t
  3954. + add r6, r5 // tmp2
  3955. + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
  3956. + add r7, r5 // tmp3
  3957. + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
  3958. +
  3959. + paddsub.h r5, r3:t, r1:t
  3960. + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
  3961. +
  3962. + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
  3963. + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
  3964. +
  3965. +
  3966. + addhh.w lr, r3:b, r1:b // lr = z4
  3967. + addhh.w r5, r4:b, lr:b
  3968. + mulhh.w r5, r5:b, r9:b // r5 = z5
  3969. +
  3970. + ld.w r9, pc[coef_table - . + 12]
  3971. + mulhh.w r4, r4:b, r12:t // r4 = z3
  3972. + mulhh.w lr, lr:b, r12:b // lr = z4
  3973. +
  3974. + add r4, r5
  3975. + add lr, r5
  3976. +
  3977. + addhh.w r5, r2:b, r1:b // r5 = z2
  3978. + addhh.w r8, r3:b, r0:b // r8 = z1
  3979. +
  3980. +
  3981. + mulhh.w r0, r0:b, r9:t // r0 = tmp0
  3982. + ld.w r12, pc[coef_table - . + 16]
  3983. + mulhh.w r1, r1:b, r9:b // r1 = tmp1
  3984. + ld.w r9, pc[coef_table - . + 20]
  3985. + mulhh.w r2, r2:b, r12:t // r2 = tmp2
  3986. + mulhh.w r3, r3:b, r12:b // r3 = tmp3
  3987. + mulhh.w r8, r8:b, r9:t // r8 = z1
  3988. + mulhh.w r5, r5:b, r9:b // r5 = z2
  3989. +
  3990. +
  3991. + add r0, r8
  3992. + add r0, r4
  3993. + add r1, r5
  3994. + add r1, lr
  3995. + add r2, r5
  3996. + add r2, r4
  3997. + add r3, r8
  3998. + add r3, lr
  3999. +
  4000. + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
  4001. + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
  4002. + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
  4003. + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
  4004. +
  4005. + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
  4006. + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
  4007. + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
  4008. + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
  4009. +
  4010. + sthh.w r10[0], r4:t, r5:t
  4011. + sthh.w r10[4], r3:t, r2:t
  4012. + sthh.w r10[8], r2:b, r3:b
  4013. + sthh.w r10[12], r5:b, r4:b
  4014. +
  4015. +
  4016. +
  4017. + sub r10, -16
  4018. + sub loop_cnt, 1
  4019. + brne FOR_ROW, e
  4020. +
  4021. +COLOUMN_TRANSFORM:
  4022. +
  4023. + sub r10, 128 //Set pointer to start of DCT block
  4024. +
  4025. +
  4026. + mov loop_cnt, 8
  4027. +FOR_COLOUMN:
  4028. + ldins.h r3:t,r10[0] // r3:t = dataptr[0]
  4029. + ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
  4030. + ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
  4031. + ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
  4032. + ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
  4033. + ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
  4034. + ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
  4035. + ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
  4036. +
  4037. + or r4, r1, r3 << 16
  4038. + or r4, r2
  4039. + or r4, r0
  4040. + brne AC_COLOUMN //If there are non-zero AC coefficients, perform the column transform
  4041. +
  4042. + lddsp r12, SP[0] // rfp
  4043. + lddsp r9, SP[4] // iinc
  4044. + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9
  4045. + ld.d r0, r12[0]
  4046. + sub r10, -2 // Increment the dataptr
  4047. + bfins r3, r3, 16, 16
  4048. + punpckub.h r2, r1:t
  4049. + padd.h r2, r2, r3
  4050. + punpckub.h r1, r1:b
  4051. + padd.h r1, r1, r3
  4052. + packsh.ub r1, r2, r1
  4053. + punpckub.h r2, r0:t
  4054. + padd.h r2, r2, r3
  4055. + punpckub.h r0, r0:b
  4056. + padd.h r0, r0, r3
  4057. + packsh.ub r0, r2, r0
  4058. + st.d r12[0], r0
  4059. + add r12, r9 // increment rfp
  4060. + stdsp SP[0], r12
  4061. +
  4062. + sub loop_cnt, 1 //Decrement loop counter
  4063. + brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
  4064. +
  4065. + sub sp, -8
  4066. + popm r0-r3, r4-r7, pc //Pop back registers and PC
  4067. +
  4068. +AC_COLOUMN:
  4069. +
  4070. + ld.w r12, pc[coef_table - .]
  4071. + ld.w r9, pc[coef_table - . + 4]
  4072. +
  4073. + addhh.w r4, r2:t, r2:b
  4074. + mulhh.w r4, r4:b, r12:t // r4 = z1
  4075. + mulhh.w r5, r2:b, r12:b
  4076. + ld.w r12, pc[coef_table - . + 8]
  4077. + mulhh.w r6, r2:t, r9:t
  4078. + add r5, r4 // r5 = tmp2
  4079. + add r6, r4 // r6 = tmp3
  4080. +
  4081. + addhh.w r7, r3:t, r3:b
  4082. + subhh.w r8, r3:t, r3:b
  4083. +
  4084. + lsl r7, CONST_BITS
  4085. + lsl r8, CONST_BITS
  4086. +
  4087. + add r2, r7, r6 // r2 = tmp10
  4088. + sub r3, r7, r6 // r3 = tmp13
  4089. + add r4, r8, r5 // r4 = tmp11
  4090. + sub r5, r8, r5 // r5 = tmp12
  4091. +
  4092. + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
  4093. + addhh.w r7, r6:t, r6:b
  4094. + mulhh.w r7, r7:b, r9:b // r7 = z5
  4095. +
  4096. + ld.w r9, pc[coef_table - . + 12]
  4097. + mulhh.w r8, r6:b, r12:t // r8 = z3
  4098. + mulhh.w r6, r6:t, r12:b // r6 = z4
  4099. +
  4100. + add r8, r7
  4101. + add r6, r7
  4102. +
  4103. + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
  4104. +
  4105. + mulhh.w r12, r0:b, r9:t // r12 = tmp0
  4106. + mulhh.w r0, r0:t, r9:b // r0 = tmp1
  4107. + ld.w r9, pc[coef_table - . + 16]
  4108. + add r12, r8
  4109. + add r0, r6
  4110. +
  4111. + ld.w lr, pc[coef_table - . + 20]
  4112. + machh.w r8, r1:b, r9:t // r8 = tmp2
  4113. + machh.w r6, r1:t, r9:b // r6 = tmp3
  4114. + mulhh.w r9, r7:b, lr:t // r9 = z1
  4115. + mulhh.w r7, r7:t, lr:b // r7 = z2
  4116. +
  4117. +
  4118. + add r12, r9
  4119. + add r0, r7
  4120. + add r8, r7
  4121. + add r6, r9
  4122. +
  4123. + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
  4124. + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
  4125. + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
  4126. + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
  4127. + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
  4128. + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
  4129. + add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
  4130. + sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
  4131. +
  4132. + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
  4133. + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
  4134. + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
  4135. + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
  4136. + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
  4137. + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
  4138. + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
  4139. + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
  4140. +
  4141. + packw.sh r1, r1, r6
  4142. + packw.sh r8, r8, r0
  4143. + packw.sh r3, r3, r5
  4144. + packw.sh r4, r4, r2
  4145. +
  4146. + lddsp r12, SP[0] // rfp
  4147. + lddsp r9, SP[4] // iinc
  4148. + ld.d r6, r12[0]
  4149. + sub r10, -2 // Increment the dataptr
  4150. + punpckub.h r0, r7:t
  4151. + padd.h r1, r1, r0
  4152. + punpckub.h r0, r7:b
  4153. + padd.h r8, r8, r0
  4154. + packsh.ub r7, r1, r8
  4155. + punpckub.h r0, r6:t
  4156. + padd.h r3, r3, r0
  4157. + punpckub.h r0, r6:b
  4158. + padd.h r4, r4, r0
  4159. + packsh.ub r6, r3, r4
  4160. + st.d r12[0], r6
  4161. + add r12, r9 // increment rfp
  4162. + stdsp SP[0], r12
  4163. +
  4164. + sub loop_cnt, 1 //Decrement loop counter
  4165. + brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
  4166. +
  4167. + sub sp, -8
  4168. + popm r0-r3, r4-r7, pc //Pop back registers and PC
  4169. +
  4170. +
  4171. +
  4172. +//Coefficient Table:
  4173. + .align 2
  4174. +coef_table:
  4175. + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
  4176. + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
  4177. + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
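/* Editor's note: layout of coef_table as consumed by the
   "ld.w ..., pc[coef_table - . + N]" loads above. Each word packs two
   16-bit constants, high halfword first:
     +0  : FIX_0_541196100, -FIX_1_847759065
     +4  : FIX_0_765366865,  FIX_1_175875602
     +8  : -FIX_1_961570560, -FIX_0_390180644
     +12 : FIX_0_298631336,  FIX_2_053119869
     +16 : FIX_3_072711026,  FIX_1_501321110
     +20 : -FIX_0_899976223, -FIX_2_562915447 */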
  4178. +
  4179. +
  4180. +idct_put_avr32:
  4181. + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
  4182. +
  4183. + // Give room for some variables on the stack
  4184. + sub sp, 8
  4185. + stdsp SP[0], r12 // rfp
  4186. + stdsp SP[4], r11 // iinc
  4187. +
  4188. + mov loop_cnt, 8 //Initialize loop counter
  4189. +
  4190. +0:
  4191. +
  4192. + ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row of the DCT block
  4193. + mov r6, 0
  4194. +#ifdef USE_PREFETCH
  4195. + pref r10[LINE_SIZE] //Prefetch next line
  4196. +#endif
  4197. + or r4, r2, r3 << 16
  4198. + or r4, r1 //Check if all DCT coefficients except the DC are zero
  4199. + or r4, r0
  4200. + brne 1f //If there are non-zero AC coefficients, perform the row transform
  4201. +
  4202. + paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
  4203. + plsl.h r5, r5, PASS1_BITS
  4204. + mov r4, r5
  4205. + st.d r10++, r4
  4206. + st.d r10++, r4
  4207. +
  4208. + sub loop_cnt, 1 //Decrement loop counter
  4209. + brne 0b //Perform loop one more time if loop_cnt is not zero
  4210. +
  4211. + bral 2f //Perform the column transform after the row transform is done
  4212. +
  4213. +1:
  4214. +
  4215. + ld.w r12, pc[coef_table_copy - .]
  4216. + ld.w r9, pc[coef_table_copy - . + 4]
  4217. +
  4218. + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
  4219. + mulhh.w r5, r4:t, r12:t
  4220. + mulhh.w r6, r0:t, r12:b
  4221. + ld.w r12, pc[coef_table_copy - . + 8]
  4222. + mulhh.w r7, r2:t, r9:t
  4223. + add r6, r5 // tmp2
  4224. + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
  4225. + add r7, r5 // tmp3
  4226. + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
  4227. +
  4228. + paddsub.h r5, r3:t, r1:t
  4229. + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
  4230. +
  4231. + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
  4232. + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
  4233. +
  4234. +
  4235. +
  4236. + addhh.w lr, r3:b, r1:b // lr = z4
  4237. + addhh.w r5, r4:b, lr:b
  4238. + mulhh.w r5, r5:b, r9:b // r5 = z5
  4239. +
  4240. + ld.w r9, pc[coef_table_copy - . + 12]
  4241. + mulhh.w r4, r4:b, r12:t // r4 = z3
  4242. + mulhh.w lr, lr:b, r12:b // lr = z4
  4243. +
  4244. + add r4, r5
  4245. + add lr, r5
  4246. +
  4247. + addhh.w r5, r2:b, r1:b // r5 = z2
  4248. + addhh.w r8, r3:b, r0:b // r8 = z1
  4249. +
  4250. +
  4251. + mulhh.w r0, r0:b, r9:t // r0 = tmp0
  4252. + ld.w r12, pc[coef_table_copy - . + 16]
  4253. + mulhh.w r1, r1:b, r9:b // r1 = tmp1
  4254. + ld.w r9, pc[coef_table_copy - . + 20]
  4255. + mulhh.w r2, r2:b, r12:t // r2 = tmp2
  4256. + mulhh.w r3, r3:b, r12:b // r3 = tmp3
  4257. + mulhh.w r8, r8:b, r9:t // r8 = z1
  4258. + mulhh.w r5, r5:b, r9:b // r5 = z2
  4259. +
  4260. +
  4261. + add r0, r8
  4262. + add r0, r4
  4263. + add r1, r5
  4264. + add r1, lr
  4265. + add r2, r5
  4266. + add r2, r4
  4267. + add r3, r8
  4268. + add r3, lr
  4269. +
  4270. + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
  4271. + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
  4272. + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
  4273. + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
  4274. +
  4275. + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
  4276. + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
  4277. + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
  4278. + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
  4279. +
  4280. + sthh.w r10[0], r4:t, r5:t
  4281. + sthh.w r10[4], r3:t, r2:t
  4282. + sthh.w r10[8], r2:b, r3:b
  4283. + sthh.w r10[12], r5:b, r4:b
  4284. +
  4285. +
  4286. +
  4287. + sub r10, -16
  4288. + sub loop_cnt, 1
  4289. + brne 0b
  4290. +
  4291. +2:
  4292. +
  4293. + sub r10, 128 //Set pointer to start of DCT block
  4294. +
  4295. + mov loop_cnt, 8
  4296. +
  4297. +0:
  4298. + ldins.h r3:t,r10[0] // r3:t = dataptr[0]
  4299. + ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
  4300. + ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
  4301. + ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
  4302. + ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
  4303. + ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
  4304. + ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
  4305. + ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
  4306. +
  4307. + or r4, r1, r3 << 16
  4308. + or r4, r2
  4309. + or r4, r0
  4310. + brne 1f //If there are non-zero AC coefficients, perform the column transform
  4311. +
  4312. + lddsp r12, SP[0] // rfp
  4313. + lddsp r9, SP[4] // iinc
  4314. + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
  4315. + packw.sh r3, r3, r3
  4316. + packsh.ub r3, r3, r3
  4317. + mov r2, r3
  4318. + st.d r12[0], r2
  4319. + add r12, r9 // increment rfp
  4320. + sub r10, -2 // Increment the dataptr
  4321. + stdsp SP[0], r12
  4322. +
  4323. + sub loop_cnt, 1 //Decrement loop counter
  4324. + brne 0b //Perform loop one more time if loop_cnt is not zero
  4325. +
  4326. + sub sp, -8
  4327. + popm r0-r3, r4-r7, pc //Pop back registers and PC
  4328. +
  4329. +1:
  4330. +
  4331. + ld.w r12, pc[coef_table_copy - .]
  4332. + ld.w r9, pc[coef_table_copy - . + 4]
  4333. +
  4334. + addhh.w r4, r2:t, r2:b
  4335. + mulhh.w r4, r4:b, r12:t // r4 = z1
  4336. + mulhh.w r5, r2:b, r12:b
  4337. + ld.w r12, pc[coef_table_copy - . + 8]
  4338. + mulhh.w r6, r2:t, r9:t
  4339. + add r5, r4 // r5 = tmp2
  4340. + add r6, r4 // r6 = tmp3
  4341. +
  4342. + addhh.w r7, r3:t, r3:b
  4343. + subhh.w r8, r3:t, r3:b
  4344. +
  4345. + lsl r7, CONST_BITS
  4346. + lsl r8, CONST_BITS
  4347. +
  4348. + add r2, r7, r6 // r2 = tmp10
  4349. + sub r3, r7, r6 // r3 = tmp13
  4350. + add r4, r8, r5 // r4 = tmp11
  4351. + sub r5, r8, r5 // r5 = tmp12
  4352. +
  4353. +
  4354. + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
  4355. + addhh.w r7, r6:t, r6:b
  4356. + mulhh.w r7, r7:b, r9:b // r7 = z5
  4357. +
  4358. + ld.w r9, pc[coef_table_copy - . + 12]
  4359. + mulhh.w r8, r6:b, r12:t // r8 = z3
  4360. + mulhh.w r6, r6:t, r12:b // r6 = z4
  4361. +
  4362. + add r8, r7
  4363. + add r6, r7
  4364. +
  4365. + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
  4366. +
  4367. + mulhh.w r12, r0:b, r9:t // r12 = tmp0
  4368. + mulhh.w r0, r0:t, r9:b // r0 = tmp1
  4369. + ld.w r9, pc[coef_table_copy - . + 16]
  4370. + add r12, r8
  4371. + add r0, r6
  4372. +
  4373. + ld.w lr, pc[coef_table_copy - . + 20]
  4374. + machh.w r8, r1:b, r9:t // r8 = tmp2
  4375. + machh.w r6, r1:t, r9:b // r6 = tmp3
  4376. + mulhh.w r9, r7:b, lr:t // r9 = z1
  4377. + mulhh.w r7, r7:t, lr:b // r7 = z2
  4378. +
  4379. +
  4380. + add r12, r9
  4381. + add r0, r7
  4382. + add r8, r7
  4383. + add r6, r9
  4384. +
  4385. + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
  4386. + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
  4387. + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
  4388. + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
  4389. + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
  4390. + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
  4391. + add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
  4392. + sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
  4393. +
  4394. + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
  4395. + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
  4396. + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
  4397. + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
  4398. + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
  4399. + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
  4400. + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
  4401. + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
  4402. +
  4403. + packw.sh r1, r1, r6
  4404. + packw.sh r8, r8, r0
  4405. + packw.sh r3, r3, r5
  4406. + packw.sh r4, r4, r2
  4407. +
  4408. + packsh.ub r1, r1, r8
  4409. + packsh.ub r0, r3, r4
  4410. + lddsp r12, SP[0] // rfp
  4411. + lddsp r9, SP[4] // iinc
  4412. + st.d r12[0], r0
  4413. + sub r10, -2 // Increment the dataptr
  4414. + add r12, r9 // increment rfp
  4415. + stdsp SP[0], r12
  4416. +
  4417. + sub loop_cnt, 1 //Decrement loop counter
  4418. + brne 0b //Perform loop one more time if loop_cnt is not zero
  4419. +
  4420. + sub sp, -8
  4421. + popm r0-r3, r4-r7, pc //Pop back registers and PC
  4422. +
  4423. +
  4424. +
  4425. + .align 2
  4426. +coef_table_copy:
  4427. + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
  4428. + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
  4429. + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
  4430. +
  4431. +
  4432. +idct_avr32:
  4433. + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
  4434. +
  4435. + // Give room for a temporary block on the stack
  4436. + sub sp, 8*8*2
  4437. +
  4438. + mov loop_cnt, 8 //Initialize loop counter
  4439. +
  4440. +0:
  4441. +
  4442. + ldm r12++, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row of the DCT block
  4443. + mov r6, 0
  4444. +#ifdef USE_PREFETCH
  4445. + pref r12[LINE_SIZE] //Prefetch next line
  4446. +#endif
  4447. + or r4, r2, r3 << 16
  4448. + or r4, r1 //Check if all DCT coefficients except the DC are zero
  4449. + or r4, r0
  4450. + brne 1f //If there are non-zero AC coefficients, perform the row transform
  4451. +
  4452. + paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
  4453. + plsl.h r5, r5, PASS1_BITS
  4454. + mov r4, r5
  4455. + st.d sp++, r4
  4456. + st.d sp++, r4
  4457. +
  4458. + sub loop_cnt, 1 //Decrement loop counter
  4459. + brne 0b //Perform loop one more time if loop_cnt is not zero
  4460. +
  4461. + bral 2f //Perform the column transform after the row transform is done
  4462. +
  4463. +1:
  4464. +
  4465. + ld.w r10, pc[coef_table_idct - .]
  4466. + ld.w r9, pc[coef_table_idct - . + 4]
  4467. +
  4468. + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
  4469. + mulhh.w r5, r4:t, r10:t
  4470. + mulhh.w r6, r0:t, r10:b
  4471. + ld.w r10, pc[coef_table_idct - . + 8]
  4472. + mulhh.w r7, r2:t, r9:t
  4473. + add r6, r5 // tmp2
  4474. + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
  4475. + add r7, r5 // tmp3
  4476. + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
  4477. +
  4478. + paddsub.h r5, r3:t, r1:t
  4479. + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
  4480. +
  4481. + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
  4482. + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
  4483. +
  4484. +
  4485. +
  4486. + addhh.w lr, r3:b, r1:b // lr = z4
  4487. + addhh.w r5, r4:b, lr:b
  4488. + mulhh.w r5, r5:b, r9:b // r5 = z5
  4489. +
  4490. + ld.w r9, pc[coef_table_idct - . + 12]
  4491. + mulhh.w r4, r4:b, r10:t // r4 = z3
  4492. + mulhh.w lr, lr:b, r10:b // lr = z4
  4493. +
  4494. + add r4, r5
  4495. + add lr, r5
  4496. +
  4497. + addhh.w r5, r2:b, r1:b // r5 = z2
  4498. + addhh.w r8, r3:b, r0:b // r8 = z1
  4499. +
  4500. +
  4501. + mulhh.w r0, r0:b, r9:t // r0 = tmp0
  4502. + ld.w r10, pc[coef_table_idct - . + 16]
  4503. + mulhh.w r1, r1:b, r9:b // r1 = tmp1
  4504. + ld.w r9, pc[coef_table_idct - . + 20]
  4505. + mulhh.w r2, r2:b, r10:t // r2 = tmp2
  4506. + mulhh.w r3, r3:b, r10:b // r3 = tmp3
  4507. + mulhh.w r8, r8:b, r9:t // r8 = z1
  4508. + mulhh.w r5, r5:b, r9:b // r5 = z2
  4509. +
  4510. +
  4511. + add r0, r8
  4512. + add r0, r4
  4513. + add r1, r5
  4514. + add r1, lr
  4515. + add r2, r5
  4516. + add r2, r4
  4517. + add r3, r8
  4518. + add r3, lr
  4519. +
  4520. + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
  4521. + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
  4522. + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
  4523. + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
  4524. +
  4525. + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
  4526. + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
  4527. + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
  4528. + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
  4529. +
  4530. + sthh.w sp[0], r4:t, r5:t
  4531. + sthh.w sp[4], r3:t, r2:t
  4532. + sthh.w sp[8], r2:b, r3:b
  4533. + sthh.w sp[12], r5:b, r4:b
  4534. +
  4535. +
  4536. +
  4537. + sub sp, -16
  4538. + sub loop_cnt, 1
  4539. + brne 0b
  4540. +
  4541. +2:
  4542. +
  4543. + sub sp, 8*8*2 //Set pointer to start of DCT block
  4544. + sub r12, 8*8*2 //Set pointer to start of DCT block
  4545. +
  4546. + mov loop_cnt, 8
  4547. +
  4548. +0:
  4549. + ldins.h r3:t,sp[0] // r3:t = dataptr[0]
  4550. + ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1]
  4551. + ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2]
  4552. + ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5]
  4553. + ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4]
  4554. + ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3]
  4555. + ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6]
  4556. + ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7]
  4557. +
  4558. + or r4, r1, r3 << 16
  4559. + or r4, r2
  4560. + or r4, r0
  4561. + brne 1f //If there are non-zero AC coefficients, perform the column transform
  4562. +
  4563. + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
  4564. + packw.sh r3, r3, r3
  4565. + mov r2, r3
  4566. + st.d r12++, r2
  4567. + st.d r12++, r2
  4568. + sub sp, -2 // Increment the dataptr
  4569. +
  4570. + sub loop_cnt, 1 //Decrement loop counter
  4571. + brne 0b //Perform loop one more time if loop_cnt is not zero
  4572. +
  4573. + sub sp, -(8*8*2 - 8)
  4574. + popm r0-r3, r4-r7, pc //Pop back registers and PC
  4575. +
  4576. +1:
  4577. +
  4578. + ld.w r10, pc[coef_table_idct - .]
  4579. + ld.w r9, pc[coef_table_idct - . + 4]
  4580. +
  4581. + addhh.w r4, r2:t, r2:b
  4582. + mulhh.w r4, r4:b, r10:t // r4 = z1
  4583. + mulhh.w r5, r2:b, r10:b
  4584. + ld.w r10, pc[coef_table_idct - . + 8]
  4585. + mulhh.w r6, r2:t, r9:t
  4586. + add r5, r4 // r5 = tmp2
  4587. + add r6, r4 // r6 = tmp3
  4588. +
  4589. + addhh.w r7, r3:t, r3:b
  4590. + subhh.w r8, r3:t, r3:b
  4591. +
  4592. + lsl r7, CONST_BITS
  4593. + lsl r8, CONST_BITS
  4594. +
  4595. + add r2, r7, r6 // r2 = tmp10
  4596. + sub r3, r7, r6 // r3 = tmp13
  4597. + add r4, r8, r5 // r4 = tmp11
  4598. + sub r5, r8, r5 // r5 = tmp12
  4599. +
  4600. +
  4601. + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
  4602. + addhh.w r7, r6:t, r6:b
  4603. + mulhh.w r7, r7:b, r9:b // r7 = z5
  4604. +
  4605. + ld.w r9, pc[coef_table_idct - . + 12]
  4606. + mulhh.w r8, r6:b, r10:t // r8 = z3
  4607. + mulhh.w r6, r6:t, r10:b // r6 = z4
  4608. +
  4609. + add r8, r7
  4610. + add r6, r7
  4611. +
  4612. + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
  4613. +
  4614. + mulhh.w r10, r0:b, r9:t // r10 = tmp0
  4615. + mulhh.w r0, r0:t, r9:b // r0 = tmp1
  4616. + ld.w r9, pc[coef_table_idct - . + 16]
  4617. + add r10, r8
  4618. + add r0, r6
  4619. +
  4620. + ld.w lr, pc[coef_table_idct - . + 20]
  4621. + machh.w r8, r1:b, r9:t // r8 = tmp2
  4622. + machh.w r6, r1:t, r9:b // r6 = tmp3
  4623. + mulhh.w r9, r7:b, lr:t // r9 = z1
  4624. + mulhh.w r7, r7:t, lr:b // r7 = z2
  4625. +
  4626. +
  4627. + add r10, r9
  4628. + add r0, r7
  4629. + add r8, r7
  4630. + add r6, r9
  4631. +
  4632. + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
  4633. + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
  4634. + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
  4635. + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
  4636. + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
  4637. + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
  4638. + add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
  4639. + sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
  4640. +
  4641. + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
  4642. + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
  4643. + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
  4644. + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
  4645. + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
  4646. + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
  4647. + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
  4648. + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
  4649. +
  4650. + packw.sh r7, r1, r6
  4651. + packw.sh r6, r8, r0
  4652. + packw.sh r5, r3, r5
  4653. + packw.sh r4, r4, r2
  4654. +
  4655. + stm r12, r4-r7
  4656. + sub sp, -2 // Increment the dataptr
  4657. + sub r12, -16
  4658. +
  4659. + sub loop_cnt, 1 //Decrement loop counter
  4660. + brne 0b //Perform loop one more time if loop_cnt is not zero
  4661. +
  4662. + sub sp, -(8*8*2 - 8)
  4663. + popm r0-r3, r4-r7, pc //Pop back registers and PC
  4664. +
  4665. +
  4666. +
  4667. + .align 2
  4668. +coef_table_idct:
  4669. + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
  4670. + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
  4671. + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
  4672. +
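/* Editor's note: idct_add_avr32, idct_put_avr32 and idct_avr32 above all
   share the same pass structure, the classic libjpeg "islow"
   (Loeffler-style) IDCT. A C sketch of the even half of one pass, using
   the z1/tmp names from the register comments (d0..d6 are the even-index
   input coefficients; this is a sketch, not the exact scalar reference):

       z1    = (d2 + d6) * FIX_0_541196100;
       tmp2  = z1 - d6 * FIX_1_847759065;
       tmp3  = z1 + d2 * FIX_0_765366865;
       tmp0  = (d0 + d4) << CONST_BITS;
       tmp1  = (d0 - d4) << CONST_BITS;
       tmp10 = tmp0 + tmp3;   tmp13 = tmp0 - tmp3;
       tmp11 = tmp1 + tmp2;   tmp12 = tmp1 - tmp2;
*/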
  4673. diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S
  4674. new file mode 100644
  4675. index 0000000..07a002d
  4676. --- /dev/null
  4677. +++ b/libavcodec/avr32/mc.S
  4678. @@ -0,0 +1,434 @@
  4679. +/*
  4680. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  4681. + *
  4682. + * Redistribution and use in source and binary forms, with or without
  4683. + * modification, are permitted provided that the following conditions
  4684. + * are met:
  4685. + *
  4686. + * 1. Redistributions of source code must retain the above copyright
  4687. + * notice, this list of conditions and the following disclaimer.
  4688. + *
  4689. + * 2. Redistributions in binary form must reproduce the above
  4690. + * copyright notice, this list of conditions and the following
  4691. + * disclaimer in the documentation and/or other materials provided
  4692. + * with the distribution.
  4693. + *
  4694. + * 3. The name of ATMEL may not be used to endorse or promote products
  4695. + * derived from this software without specific prior written
  4696. + * permission.
  4697. + *
  4698. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  4699. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  4700. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  4701. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  4702. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  4703. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  4704. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  4705. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  4706. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  4707. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  4708. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  4709. + * DAMAGE.
  4710. + */
  4711. +
  4712. +
  4713. + /* Macro for masking the lowest bit of each byte in a
  4714. + packed word */
  4715. + .macro packedmask1 reg, round
  4716. + .if \round
  4717. + and \reg, \reg, r8 >> 1
  4718. + .else
  4719. + and \reg, r8
  4720. + .endif
  4721. + .endm
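/* Editor's note: paddh.ub computes a truncating bytewise average,
   (a + b) >> 1 per byte, so the low bit (a ^ b) & 1 of each byte sum is
   discarded. packedmask1 collects exactly those discarded bits, which
   lets the two-stage halving in pixels8_hv below still produce the
   correctly rounded 4-tap result. Per byte, the end result matches this
   C sketch (`rnd` is 2 for the rounding variant and 1 for no_rnd,
   an assumption drawn from the immediates loaded into r8):

       static inline uint8_t avg4_sketch(uint8_t A, uint8_t B,
                                         uint8_t C, uint8_t D, int rnd)
       {
           return (uint8_t) ((A + B + C + D + rnd) >> 2);
       }
*/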
  4722. +
  4723. + /* Macro for 8 pixel wide horizontal and vertical interpolation functions */
  4724. + .macro pixels8_hv round, put
  4725. +
  4726. +
  4727. + pushm r0-r7, lr
  4728. +
  4729. + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
  4730. +
  4731. + /* Rounding immediate */
  4732. + .if \round
  4733. + mov r8, lo(0x02020202)
  4734. + orh r8, hi(0x02020202)
  4735. + .else
  4736. + mov r8, lo(0x01010101)
  4737. + orh r8, hi(0x01010101)
  4738. + .endif
  4739. + mov r7, 2
  4740. +
  4741. + /* Pixel naming convention :
  4742. +
  4743. + |-----------------------------------------------------|
  4744. + | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
  4745. + |----d00---d01---d02---d03---d04---d05---d06---d07----|
  4746. + | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
  4747. + |-----------------------------------------------------|
  4748. + */
  4749. +1:
  4750. + ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
  4751. + ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
  4752. + mov lr, r9
  4753. + eor r2, r0, r1
  4754. + packedmask1 r2, \round
  4755. + add r2, r8
  4756. +
  4757. + paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
  4758. +
  4759. + add r11, r10 // pixels += line_size
  4760. + ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
  4761. + ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
  4762. +0:
  4763. + eor r5, r1, r3
  4764. + packedmask1 r5, \round
  4765. + add r2, r5
  4766. +
  4767. + paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
  4768. + eor r6, r0, r1
  4769. + packedmask1 r6, \round
  4770. + add r2, r2, r6 << 1
  4771. +
  4772. + ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 }
  4773. + add r11, r10 // pixels += line_size
  4774. + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
  4775. +
  4776. + paddh.ub r0, r0, r1
  4777. + plsr.b r2, r2, 2
  4778. + padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
  4779. +
  4780. + /* Next row */
  4781. + .if \put
  4782. + eor r2, r3, r4
  4783. + packedmask1 r2, \round
  4784. + add r2, r8
  4785. + .else
  4786. + ld.w r6, r12[0]
  4787. + eor r2, r3, r4
  4788. + packedmask1 r2, \round
  4789. + add r2, r8
  4790. + pavg.ub r0, r0, r6
  4791. + .endif
  4792. + st.w r12[0], r0 // Put data into the block
  4793. +
  4794. + add r5, r2
  4795. + paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
  4796. +
  4797. + eor r6, r0, r1
  4798. + packedmask1 r6, \round
  4799. + add r5, r5, r6 << 1
  4800. +
  4801. + .if \put
  4802. + paddh.ub r1, r0, r1
  4803. + plsr.b r5, r5, 2
  4804. + padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
  4805. + .else
  4806. + ld.w r3, r12[r10]
  4807. + paddh.ub r1, r0, r1
  4808. + plsr.b r5, r5, 2
  4809. + padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
  4810. + pavg.ub r1, r1, r3
  4811. + .endif
  4812. +
  4813. + st.w r12[r10], r1 // Put data into the block
  4814. +
  4815. +
  4816. + ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 }
  4817. + add r11, r10 // pixels += line_size
  4818. + ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
  4819. + add r12, r12, r10 << 1 // block += 2*line_size
  4820. + sub lr, 2
  4821. + brne 0b
  4822. +
  4823. + mul r0, r10, r9 // r0 = line_size * h
  4824. + rsub r0, r0, 4 // r0 = 4 - (line_size * h)
  4825. + add r11, r0
  4826. + sub r11, r10 // pixels += 4 - (line_size * (h+1))
  4827. + add r12, r0 // block += 4 - (line_size * h)
  4828. + sub r7, 1
  4829. + brne 1b
  4830. +
  4831. + popm r0-r7, pc
  4832. + .endm
  4833. +
  4834. +
  4835. + /* Macro for 8 pixel wide vertical interpolation functions */
  4836. +
  4837. + .macro pixels8_v round, put
  4838. + pushm r4-r7,lr
  4839. + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
  4840. +
  4841. + /*
  4842. + Pixel Naming Convention :
  4843. + |-----------------------------------------------|
  4844. + | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
  4845. + |-d00---d01---d02---d03---d04---d05---d06---d07-|
  4846. + | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
  4847. + |-----------------------------------------------|
  4848. + */
  4849. + ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
  4850. + ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
  4851. + ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
  4852. + ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
  4853. + sub r10, 4 // stride -= 4
  4854. + add r11, r11, r10 << 1 // src += 2*stride
  4855. + sub r11, -4 // src += 4
  4856. +
  4857. +0:
  4858. + .if \round
  4859. + pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
  4860. + pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
  4861. + .else
  4862. + paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
  4863. + paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
  4864. + .endif
  4865. +
  4866. + .if \put
  4867. + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
  4868. + ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
  4869. + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
  4870. + ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
  4871. + .else
  4872. + ld.w lr, r12[0]
  4873. + ld.w r7, r12[4]
  4874. + pavg.ub r5, r5, lr
  4875. + pavg.ub r4, r4, r7
  4876. + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
  4877. + ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
  4878. + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
  4879. + ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
  4880. + .endif
  4881. + add r11, r10 // src += stride
  4882. +#ifdef USE_PREFETCH
  4883. + pref r11[0]
  4884. +#endif
  4885. + add r12, r10 // dst += stride
  4886. +
  4887. + .if \round
  4888. + pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
  4889. + pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
  4890. + .else
  4891. + paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
  4892. + paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
  4893. + .endif
  4894. + .if \put
  4895. + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
  4896. + ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
  4897. + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
  4898. + ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
  4899. + .else
  4900. + ld.w r8, r12[0]
  4901. + ld.w r6, r12[4]
  4902. + pavg.ub r5, r5, r8
  4903. + pavg.ub r4, r4, r6
  4904. + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
  4905. + ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
  4906. + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
  4907. + ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
  4908. + .endif
  4909. +
  4910. + add r11, r10 // src += stride
  4911. +#ifdef USE_PREFETCH
  4912. + pref r11[0]
  4913. +#endif
  4914. + add r12, r10 // dst += stride
  4915. + sub r9, 2
  4916. + brne 0b
  4917. +
  4918. + popm r4-r7,pc
  4919. + .endm
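/* Editor's note: the \round parameter selects between the two packed
   byte averages: pavg.ub rounds and paddh.ub truncates (an assumption
   about the ISA semantics, consistent with the rnd/no_rnd split of the
   entry points below). Per output byte the loop body is this C sketch:

       dst[x] = (row0[x] + row1[x] + round) >> 1;   // round = 1 or 0

   where row0 and row1 are vertically adjacent source rows. */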
  4920. +
  4921. + /* Macro for 8 pixel wide horizontal interpolation functions */
  4922. +
  4923. + .macro pixels8_h round, put
  4924. + pushm r4-r7, lr
  4925. +
  4926. + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
  4927. + /*
  4928. + Pixel Naming Convention:
  4929. + |--------------------------------------------------------------------|
  4930. + | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
  4931. + |------|-------|-------|-------|-------|-------|-------|-------|-----|
  4932. + | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
  4933. + |--------------------------------------------------------------------|
  4934. + */
  4935. +
  4936. + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
  4937. + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
  4938. + ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
  4939. + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
  4940. + add r11, r10 // src += stride
  4941. +
  4942. +0:
  4943. + .if \round
  4944. + pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
  4945. + pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
  4946. + .else
  4947. + paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
  4948. + paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
  4949. + .endif
  4950. + .if \put
  4951. + ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
  4952. + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
  4953. + .else
  4954. + ld.w r8, r12[0]
  4955. + ld.w r6, r12[4]
  4956. + ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
  4957. + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
  4958. + pavg.ub lr, lr, r8
  4959. + pavg.ub r7, r7, r6
  4960. + .endif
  4961. + st.w r12[0], lr // dst = { d00, d01, d02, d03 }
  4962. + st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
  4963. + ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
  4964. + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
  4965. + add r11, r10 // src += stride
  4966. +#ifdef USE_PREFETCH
  4967. + pref r11[0]
  4968. +#endif
  4969. + add r12, r10 // dst += stride
  4970. +
  4971. + .if \round
  4972. + pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
  4973. + pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
  4974. + .else
  4975. + paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
  4976. + paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
  4977. + .endif
  4978. + .if \put
  4979. + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
  4980. + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
  4981. + .else
  4982. + ld.w r7, r12[0]
  4983. + ld.w r6, r12[4]
  4984. + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
  4985. + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
  4986. + pavg.ub r5, r5, r7
  4987. + pavg.ub r4, r4, r6
  4988. + .endif
  4989. + st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
  4990. + st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
  4991. + ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
  4992. + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
  4993. + add r11, r10 // src += stride
  4994. +#ifdef USE_PREFETCH
  4995. + pref r11[0]
  4996. +#endif
  4997. + add r12, r10 // dst += stride
  4998. + sub r9, 2
  4999. + brne 0b
  5000. +
  5001. + popm r4-r7, pc
  5002. + .endm
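/* Editor's note: same pattern as pixels8_v but averaging horizontal
   neighbours, and the \put parameter is common to all of these macros:
   put stores the interpolated bytes directly, avg merges them into the
   existing destination with a rounded average. Sketch per output byte
   (hedged as above):

       t      = (src[x] + src[x + 1] + round) >> 1;
       dst[x] = put ? t : (uint8_t) ((dst[x] + t + 1) >> 1);  // avg: pavg.ub
*/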
  5003. +
  5004. + /* Macro for 8 pixel wide copy functions */
  5005. + .macro pixels8 put
  5006. + stm --sp, r3-r7,lr
  5007. + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
  5008. + mov lr, r9
  5009. + sub r3, r10, 2 // stride2 = stride - 2
  5010. +0:
  5011. + .if \put
  5012. + ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
  5013. + ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
  5014. + ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
  5015. + ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
  5016. + .else
  5017. + ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
  5018. + ld.d r4, r12[0]
  5019. + ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
  5020. + ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
  5021. + ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
  5022. + pavg.ub r6, r6, r4
  5023. + pavg.ub r7, r7, r5
  5024. + ld.d r4, r12[r10]
  5025. + .endif
  5026. + st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }
  5027. + add r11, r11, r3 << 1 // src += stride2 * 2
  5028. + .ifeq \put
  5029. + pavg.ub r8, r8, r4
  5030. + pavg.ub r9, r9, r5
  5031. + .endif
  5032. + st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 }
  5033. + add r12, r12, r10 << 1 // dst += 2*stride
  5034. + sub lr, 2
  5035. + brne 0b
  5036. + ldm sp++, r3-r7,pc
  5037. +
  5038. + .endm
  5039. +
  5040. + .global put_no_rnd_pixels8_hv_avr32
  5041. + .text
  5042. +put_no_rnd_pixels8_hv_avr32:
  5043. + pixels8_hv 0, 1
  5044. +
  5045. + .global put_pixels8_hv_avr32
  5046. + .text
  5047. +put_pixels8_hv_avr32:
  5048. + pixels8_hv 1, 1
  5049. +
  5050. + .global avg_no_rnd_pixels8_hv_avr32
  5051. + .text
  5052. +avg_no_rnd_pixels8_hv_avr32:
  5053. + pixels8_hv 0, 0
  5054. +
  5055. + .global avg_pixels8_hv_avr32
  5056. + .text
  5057. +avg_pixels8_hv_avr32:
  5058. + pixels8_hv 1, 0
  5059. +
  5060. + .global put_no_rnd_pixels8_v_avr32
  5061. + .text
  5062. +put_no_rnd_pixels8_v_avr32:
  5063. + pixels8_v 0, 1
  5064. +
  5065. + .global put_pixels8_v_avr32
  5066. + .text
  5067. +put_pixels8_v_avr32:
  5068. + pixels8_v 1, 1
  5069. +
  5070. + .global avg_no_rnd_pixels8_v_avr32
  5071. + .text
  5072. +avg_no_rnd_pixels8_v_avr32:
  5073. + pixels8_v 0, 0
  5074. +
  5075. + .global avg_pixels8_v_avr32
  5076. + .text
  5077. +avg_pixels8_v_avr32:
  5078. + pixels8_v 1, 0
  5079. +
  5080. + .global put_no_rnd_pixels8_h_avr32
  5081. + .text
  5082. +put_no_rnd_pixels8_h_avr32:
  5083. + pixels8_h 0, 1
  5084. +
  5085. + .global put_pixels8_h_avr32
  5086. + .text
  5087. +put_pixels8_h_avr32:
  5088. + pixels8_h 1, 1
  5089. +
  5090. + .global avg_no_rnd_pixels8_h_avr32
  5091. + .text
  5092. +avg_no_rnd_pixels8_h_avr32:
  5093. + pixels8_h 0, 0
  5094. +
  5095. + .global avg_pixels8_h_avr32
  5096. + .text
  5097. +avg_pixels8_h_avr32:
  5098. + pixels8_h 1, 0
  5099. +
  5100. + .global put_pixels8_avr32
  5101. + .global put_no_rnd_pixels8_avr32
  5102. + .text
  5103. +put_pixels8_avr32:
  5104. +put_no_rnd_pixels8_avr32:
  5105. + pixels8 1
  5106. +
  5107. + .global avg_no_rnd_pixels8_avr32
  5108. + .global avg_pixels8_avr32
  5109. + .text
  5110. +avg_pixels8_avr32:
  5111. +avg_no_rnd_pixels8_avr32:
  5112. + pixels8 0
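/* Editor's note: a plain copy involves no rounding, so the rnd and
   no_rnd entry points can alias one expansion of pixels8. Note that the
   avg variants alias as well: both expand pixels8 0 and use the rounding
   pavg.ub merge, so avg_no_rnd_pixels8_avr32 behaves exactly like
   avg_pixels8_avr32 here. */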
  5113. diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h
  5114. new file mode 100644
  5115. index 0000000..32201ba
  5116. --- /dev/null
  5117. +++ b/libavcodec/avr32/pico.h
  5118. @@ -0,0 +1,260 @@
  5119. +/*
  5120. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  5121. + *
  5122. + * Redistribution and use in source and binary forms, with or without
  5123. + * modification, are permitted provided that the following conditions
  5124. + * are met:
  5125. + *
  5126. + * 1. Redistributions of source code must retain the above copyright
  5127. + * notice, this list of conditions and the following disclaimer.
  5128. + *
  5129. + * 2. Redistributions in binary form must reproduce the above
  5130. + * copyright notice, this list of conditions and the following
  5131. + * disclaimer in the documentation and/or other materials provided
  5132. + * with the distribution.
  5133. + *
  5134. + * 3. The name of ATMEL may not be used to endorse or promote products
  5135. + * derived from this software without specific prior written
  5136. + * permission.
  5137. + *
  5138. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  5139. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  5140. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  5141. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  5142. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  5143. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  5144. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  5145. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  5146. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  5147. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  5148. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  5149. + * DAMAGE.
  5150. + */
  5151. +#ifndef __PICO_H__
  5152. +#define __PICO_H__
  5153. +
  5154. +
  5155. +
  5156. +/* Coprocessor Number */
  5157. +#define PICO_CPNO 1
  5158. +
  5159. +/* Pixel Coprocessor Register file */
  5160. +#define PICO_REGVECT_INPIX2 cr0
  5161. +#define PICO_REGVECT_INPIX1 cr1
  5162. +#define PICO_REGVECT_INPIX0 cr2
  5163. +#define PICO_REGVECT_OUTPIX2 cr3
  5164. +#define PICO_REGVECT_OUTPIX1 cr4
  5165. +#define PICO_REGVECT_OUTPIX0 cr5
  5166. +#define PICO_REGVECT_COEFF0_A cr6
  5167. +#define PICO_REGVECT_COEFF0_B cr7
  5168. +#define PICO_REGVECT_COEFF1_A cr8
  5169. +#define PICO_REGVECT_COEFF1_B cr9
  5170. +#define PICO_REGVECT_COEFF2_A cr10
  5171. +#define PICO_REGVECT_COEFF2_B cr11
  5172. +#define PICO_REGVECT_VMU0_OUT cr12
  5173. +#define PICO_REGVECT_VMU1_OUT cr13
  5174. +#define PICO_REGVECT_VMU2_OUT cr14
  5175. +#define PICO_REGVECT_CONFIG cr15
  5176. +
  5177. +#define PICO_INPIX2 0
  5178. +#define PICO_INPIX1 1
  5179. +#define PICO_INPIX0 2
  5180. +#define PICO_OUTPIX2 3
  5181. +#define PICO_OUTPIX1 4
  5182. +#define PICO_OUTPIX0 5
  5183. +#define PICO_COEFF0_A 6
  5184. +#define PICO_COEFF0_B 7
  5185. +#define PICO_COEFF1_A 8
  5186. +#define PICO_COEFF1_B 9
  5187. +#define PICO_COEFF2_A 10
  5188. +#define PICO_COEFF2_B 11
  5189. +#define PICO_VMU0_OUT 12
  5190. +#define PICO_VMU1_OUT 13
  5191. +#define PICO_VMU2_OUT 14
  5192. +#define PICO_CONFIG 15
  5193. +
  5194. +/* Config Register */
  5195. +#define PICO_COEFF_FRAC_BITS_OFFSET 0
  5196. +#define PICO_COEFF_FRAC_BITS_SIZE 4
  5197. +#define PICO_OFFSET_FRAC_BITS_OFFSET 4
  5198. +#define PICO_OFFSET_FRAC_BITS_SIZE 4
  5199. +#define PICO_INPUT_MODE_OFFSET 8
  5200. +#define PICO_INPUT_MODE_SIZE 2
  5201. +#define PICO_OUTPUT_MODE_OFFSET 10
  5202. +#define PICO_OUTPUT_MODE_SIZE 1
  5203. +
  5204. +struct pico_config_t {
  5205. + unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE;
  5206. + unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
  5207. + unsigned int input_mode : PICO_INPUT_MODE_SIZE;
  5208. + unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
  5209. + unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
  5210. + int vmu2_out;
  5211. + int vmu1_out;
  5212. + int vmu0_out;
  5213. + short coeff2_2;
  5214. + short coeff2_3;
  5215. + short coeff2_0;
  5216. + short coeff2_1;
  5217. + short coeff1_2;
  5218. + short coeff1_3;
  5219. + short coeff1_0;
  5220. + short coeff1_1;
  5221. + short coeff0_2;
  5222. + short coeff0_3;
  5223. + short coeff0_0;
  5224. + short coeff0_1;
  5225. +};
  5226. +
  5227. +
  5228. +#define PICO_COEFF_FRAC_BITS(x) ((x) << PICO_COEFF_FRAC_BITS_OFFSET)
  5229. +#define PICO_OFFSET_FRAC_BITS(x) ((x) << PICO_OFFSET_FRAC_BITS_OFFSET)
  5230. +#define PICO_INPUT_MODE(x) ((x) << PICO_INPUT_MODE_OFFSET)
  5231. +#define PICO_OUTPUT_MODE(x) ((x) << PICO_OUTPUT_MODE_OFFSET)
  5232. +
  5233. +#define GET_PICO_COEFF_FRAC_BITS(x) (((x) >> PICO_COEFF_FRAC_BITS_OFFSET) & ((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
  5234. +#define GET_PICO_OFFSET_FRAC_BITS(x) (((x) >> PICO_OFFSET_FRAC_BITS_OFFSET) & ((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
  5235. +#define GET_PICO_INPUT_MODE(x) (((x) >> PICO_INPUT_MODE_OFFSET) & ((1 << PICO_INPUT_MODE_SIZE)-1))
  5236. +#define GET_PICO_OUTPUT_MODE(x) (((x) >> PICO_OUTPUT_MODE_OFFSET) & ((1 << PICO_OUTPUT_MODE_SIZE)-1))
  5237. +
  5238. +enum pico_input_mode { PICO_TRANSFORMATION_MODE,
  5239. + PICO_HOR_FILTER_MODE,
  5240. + PICO_VERT_FILTER_MODE };
  5241. +
  5242. +enum pico_output_mode { PICO_PACKED_MODE,
  5243. + PICO_PLANAR_MODE };
  5244. +
  5245. +/* Bits in coefficients */
  5246. +#define PICO_COEFF_BITS 12
  5247. +
  5248. +/* Operation bits */
  5249. +#define PICO_MATRIX (0)
  5250. +#define PICO_USE_ACC (1 << 2)
  5251. +#define PICO_SINGLE_VECTOR (1 << 3)
  5252. +
  5253. +
  5254. +#define __str(x...) #x
  5255. +#define __xstr(x...) __str(x)
  5256. +
  5257. +#define PICO_PUT_W(pico_reg, x) \
  5258. + __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
  5259. +#define PICO_GET_W(pico_reg) \
  5260. + __builtin_mvcr_w(PICO_CPNO, pico_reg)
  5261. +
  5262. +#define PICO_MVCR_W(x, pico_reg) \
  5263. + asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
  5264. +
  5265. +#define PICO_MVRC_W(pico_reg, x) \
  5266. + asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
  5267. +
  5268. +#define PICO_PUT_D(pico_reg, x) \
  5269. + __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
  5270. +#define PICO_GET_D(pico_reg) \
  5271. + __builtin_mvcr_d(PICO_CPNO, pico_reg)
  5272. +
  5273. +#define PICO_MVCR_D(x, pico_reg) \
  5274. + asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
  5275. +#define PICO_MVRC_D(pico_reg, x) \
  5276. + asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
  5277. +
  5278. +#define PICO_STCM_W(ptr, pico_regs...) \
  5279. + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5280. +#define PICO_STCM_D(ptr, pico_regs...) \
  5281. + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5282. +
  5283. +#define PICO_STCM_W_DEC(ptr, pico_regs...) \
  5284. + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
  5285. +#define PICO_STCM_D_DEC(ptr, pico_regs...) \
  5286. + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
  5287. +
  5288. +#define PICO_LDCM_W(ptr, pico_regs...) \
  5289. + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5290. +#define PICO_LDCM_D(ptr, pico_regs...) \
  5291. + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5292. +
  5293. +#define PICO_LDCM_W_INC(ptr, pico_regs...) \
  5294. + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
  5295. +#define PICO_LDCM_D_INC(ptr, pico_regs...) \
  5296. + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
  5297. +
  5298. +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
  5299. + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
  5300. +
  5301. +static inline void set_pico_config(struct pico_config_t *config){
  5302. + PICO_LDCM_D(config,
  5303. + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
  5304. + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
  5305. + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
  5306. + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
  5307. + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
  5308. +}
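/* Editor's note: an illustrative (assumed) way to program the CONFIG
   register with the helpers above; the field values are examples only,
   not taken from the patch: */
static inline void pico_config_example(void)
{
    unsigned int cfg = PICO_COEFF_FRAC_BITS(PICO_COEFF_BITS)
                     | PICO_OFFSET_FRAC_BITS(PICO_COEFF_BITS)
                     | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
                     | PICO_OUTPUT_MODE(PICO_PACKED_MODE);
    PICO_PUT_W(PICO_CONFIG, cfg);
}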
  5309. +
  5310. +static inline void get_pico_config(struct pico_config_t *config){
  5311. + PICO_STCM_D(config,
  5312. + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
  5313. + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
  5314. + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
  5315. + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
  5316. + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
  5317. +}
  5318. +
  5319. +static inline void dump_pico_config(){
  5320. + struct pico_config_t pico_config;
  5321. + char *input_mode, *output_mode;
  5322. + get_pico_config(&pico_config);
  5323. +
  5324. +
  5325. + av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
  5326. + av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
  5327. + av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
  5328. +
  5329. + switch ( pico_config.input_mode ){
  5330. + case PICO_TRANSFORMATION_MODE:
  5331. + input_mode = "Transformation Mode";
  5332. + break;
  5333. + case PICO_HOR_FILTER_MODE:
  5334. + input_mode = "Horisontal Filter Mode";
  5335. + break;
  5336. + case PICO_VERT_FILTER_MODE:
  5337. + input_mode = "Vertical Filter Mode";
  5338. + break;
  5339. + default:
  5340. + input_mode = "Unknown Mode!!";
  5341. + break;
  5342. + }
  5343. + av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
  5344. +
  5345. + switch ( pico_config.output_mode ){
  5346. + case PICO_PLANAR_MODE:
  5347. + output_mode = "Planar Mode";
  5348. + break;
  5349. + case PICO_PACKED_MODE:
  5350. + output_mode = "Packed Mode";
  5351. + break;
  5352. + default:
  5353. + output_mode = "Unknown Mode!!";
  5354. + break;
  5355. + }
  5356. +
  5357. + av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
  5358. +
  5359. + av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
  5360. + av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
  5361. + av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
  5362. + av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
  5363. +
  5364. + av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
  5365. + av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
  5366. + av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
  5367. + av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
  5368. +
  5369. + av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
  5370. + av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
  5371. + av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
  5372. + av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
  5373. +}
  5374. +
  5375. +
  5376. +
  5377. +#endif
  5378. +
  5379. diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
  5380. index 26b4f8d..1f8fabf 100644
  5381. --- a/libavcodec/bitstream.h
  5382. +++ b/libavcodec/bitstream.h
  5383. @@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM {
  5384. #endif
  5385. /* used to avoid missaligned exceptions on some archs (alpha, ...) */
  5386. -#if defined(ARCH_X86) || defined(ARCH_X86_64)
  5387. +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32)
  5388. # define unaligned16(a) (*(const uint16_t*)(a))
  5389. # define unaligned32(a) (*(const uint32_t*)(a))
  5390. # define unaligned64(a) (*(const uint64_t*)(a))
  5391. @@ -813,6 +813,44 @@ void free_vlc(VLC *vlc);
  5392. * if the vlc code is invalid and max_depth>1 than the number of bits removed
  5393. * is undefined
  5394. */
  5395. +
  5396. +#if defined(ARCH_AVR32)
  5397. +#define GET_VLC(code, name, gb, table, bits, max_depth)\
  5398. +{\
  5399. + int n, index, nb_bits;\
  5400. + union { VLC_TYPE vlc[2];\
  5401. + uint32_t u32; } table_elem;\
  5402. +\
  5403. + index= SHOW_UBITS(name, gb, bits);\
  5404. + table_elem.u32 = unaligned32(&table[index]); \
  5405. + code = table_elem.vlc[0];\
  5406. + n = table_elem.vlc[1];\
  5407. +\
  5408. + if(max_depth > 1 && n < 0 ){\
  5409. + LAST_SKIP_BITS(name, gb, bits)\
  5410. + UPDATE_CACHE(name, gb)\
  5411. +\
  5412. + nb_bits = -n;\
  5413. +\
  5414. + index= SHOW_UBITS(name, gb, nb_bits) + code;\
  5415. + table_elem.u32 = unaligned32(&table[index]); \
  5416. + code = table_elem.vlc[0];\
  5417. + n = table_elem.vlc[1];\
  5418. + if(max_depth > 2 && n < 0){\
  5419. + LAST_SKIP_BITS(name, gb, nb_bits)\
  5420. + UPDATE_CACHE(name, gb)\
  5421. +\
  5422. + nb_bits = -n;\
  5423. +\
  5424. + index= SHOW_UBITS(name, gb, nb_bits) + code;\
  5425. + code = table[index][0];\
  5426. + n = table[index][1];\
  5427. + }\
  5428. + }\
  5429. + SKIP_BITS(name, gb, n)\
  5430. +}
  5431. +
  5432. +#else
  5433. #define GET_VLC(code, name, gb, table, bits, max_depth)\
  5434. {\
  5435. int n, index, nb_bits;\
  5436. @@ -821,7 +859,7 @@ void free_vlc(VLC *vlc);
  5437. code = table[index][0];\
  5438. n = table[index][1];\
  5439. \
  5440. - if(max_depth > 1 && n < 0){\
  5441. + if(max_depth > 1 && n < 0 ){\
  5442. LAST_SKIP_BITS(name, gb, bits)\
  5443. UPDATE_CACHE(name, gb)\
  5444. \
  5445. @@ -843,7 +881,38 @@ void free_vlc(VLC *vlc);
  5446. }\
  5447. SKIP_BITS(name, gb, n)\
  5448. }
  5449. +#endif
  5450. +#if defined(ARCH_AVR32)
  5451. +#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
  5452. +{\
  5453. + int n, index, nb_bits;\
  5454. + union { RL_VLC_ELEM vlc;\
  5455. + uint32_t u32; } table_elem;\
  5456. +\
  5457. + index= SHOW_UBITS(name, gb, bits);\
  5458. + table_elem.u32 = unaligned32(&table[index]); \
  5459. + level = table_elem.vlc.level;\
  5460. + n = table_elem.vlc.len;\
  5461. +\
  5462. + if(max_depth > 1 && n < 0 ){\
  5463. + SKIP_BITS(name, gb, bits)\
  5464. + if(need_update){\
  5465. + UPDATE_CACHE(name, gb)\
  5466. + }\
  5467. +\
  5468. + nb_bits = -n;\
  5469. +\
  5470. + index= SHOW_UBITS(name, gb, nb_bits) + level;\
  5471. + table_elem.u32 = unaligned32(&table[index]); \
  5472. + level = table_elem.vlc.level;\
  5473. + n = table_elem.vlc.len;\
  5474. + }\
  5475. + run= table_elem.vlc.run;\
  5476. + SKIP_BITS(name, gb, n)\
  5477. +}
  5478. +
  5479. +#else
  5480. #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
  5481. {\
  5482. int n, index, nb_bits;\
  5483. @@ -852,7 +921,7 @@ void free_vlc(VLC *vlc);
  5484. level = table[index].level;\
  5485. n = table[index].len;\
  5486. \
  5487. - if(max_depth > 1 && n < 0){\
  5488. + if(max_depth > 1 && n < 0 ){\
  5489. SKIP_BITS(name, gb, bits)\
  5490. if(need_update){\
  5491. UPDATE_CACHE(name, gb)\
  5492. @@ -867,7 +936,7 @@ void free_vlc(VLC *vlc);
  5493. run= table[index].run;\
  5494. SKIP_BITS(name, gb, n)\
  5495. }
  5496. -
  5497. +#endif
  5498. /**
  5499. * parses a vlc code, faster then get_vlc()
  5500. diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
  5501. index 56c42b9..8fc10c6 100644
  5502. --- a/libavcodec/dsputil.c
  5503. +++ b/libavcodec/dsputil.c
  5504. @@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
  5505. #ifdef ARCH_BFIN
  5506. dsputil_init_bfin(c,avctx);
  5507. #endif
  5508. +#ifdef ARCH_AVR32
  5509. + dsputil_init_avr32(c,avctx);
  5510. +#endif
  5511. for(i=0; i<64; i++){
  5512. if(!c->put_2tap_qpel_pixels_tab[0][i])
  5513. diff --git a/libavcodec/h264.c b/libavcodec/h264.c
  5514. index 865e80a..8f7c3f1 100644
  5515. --- a/libavcodec/h264.c
  5516. +++ b/libavcodec/h264.c
  5517. @@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){
  5518. static void init_dequant8_coeff_table(H264Context *h){
  5519. int i,q,x;
  5520. +#ifdef ARCH_AVR32
  5521. + const int transpose = 0;
  5522. +#else
  5523. const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
  5524. +#endif
  5525. +
  5526. h->dequant8_coeff[0] = h->dequant8_buffer[0];
  5527. h->dequant8_coeff[1] = h->dequant8_buffer[1];
  5528. @@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H264Context *h){
  5529. static void init_dequant4_coeff_table(H264Context *h){
  5530. int i,j,q,x;
  5531. + // Yes this is ugly as hell....
  5532. +#ifdef ARCH_AVR32
  5533. + const int transpose = 0;
  5534. +#else
  5535. const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
  5536. +#endif
  5537. +
  5538. for(i=0; i<6; i++ ){
  5539. h->dequant4_coeff[i] = h->dequant4_buffer[i];
  5540. for(j=0; j<i; j++){
  5541. @@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Context *h){
  5542. if (MPV_common_init(s) < 0)
  5543. return -1;
  5544. +#ifdef ARCH_AVR32
  5545. + if ( 1 ){
  5546. +#else
  5547. if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
  5548. +#endif
  5549. memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
  5550. memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
  5551. }else{
  5552. diff --git a/libavutil/common.h b/libavutil/common.h
  5553. index 3ae5971..7e52b90 100644
  5554. --- a/libavutil/common.h
  5555. +++ b/libavutil/common.h
  5556. @@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c)
  5557. * @param amax maximum value of the clip range
  5558. * @return cliped value
  5559. */
  5560. +#if defined(ARCH_AVR32)
  5561. +#define clip(a, amin, amax) \
  5562. + ({ int __tmp__; \
  5563. + asm ("min\t%0, %1, %2\n" \
  5564. + "max\t%0, %0, %3\n" \
  5565. + : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \
  5566. + __tmp__; })
  5567. +#else
  5568. static inline int clip(int a, int amin, int amax)
  5569. {
  5570. if (a < amin) return amin;
  5571. else if (a > amax) return amax;
  5572. else return a;
  5573. }
  5574. +#endif
  5575. /**
  5576. * clip a signed integer value into the 0-255 range
  5577. * @param a value to clip
  5578. * @return cliped value
  5579. */
  5580. +#if defined(ARCH_AVR32)
  5581. +#define clip_uint8(a) \
  5582. + ({ int __tmp__ = a; \
  5583. + asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \
  5584. + __tmp__; })
  5585. +#else
  5586. static inline uint8_t clip_uint8(int a)
  5587. {
  5588. if (a&(~255)) return (-a)>>31;
  5589. else return a;
  5590. }
  5591. +#endif
  5592. /* math */
  5593. int64_t ff_gcd(int64_t a, int64_t b);
  5594. diff --git a/libavutil/internal.h b/libavutil/internal.h
  5595. index 285d304..a8b0718 100644
  5596. --- a/libavutil/internal.h
  5597. +++ b/libavutil/internal.h
  5598. @@ -210,6 +210,15 @@ if((y)<(x)){\
  5599. }\
  5600. }
  5601. +/* XXX: Hack for uclibc which declares lrintf but does not implement it... */
  5602. +#ifdef ARCH_AVR32
  5603. +#undef HAVE_LRINTF
  5604. +#define HAVE_LRINTF 1
  5605. +#define lrintf(x) rint(x)
  5606. +#define llrint(x) (long long)rint(x)
  5607. +#endif
  5608. +
  5609. +
  5610. #ifndef HAVE_LRINTF
  5611. /* XXX: add ISOC specific test to avoid specific BSD testing. */
  5612. /* better than nothing implementation. */
  5613. diff --git a/libfaad2/common.h b/libfaad2/common.h
  5614. index f809042..6c5fb21 100644
  5615. --- a/libfaad2/common.h
  5616. +++ b/libfaad2/common.h
  5617. @@ -67,7 +67,7 @@ extern "C" {
  5618. /* Use if target platform has address generators with autoincrement */
  5619. //#define PREFER_POINTERS
  5620. -#if defined(_WIN32_WCE) || defined(__arm__)
  5621. +#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
  5622. #define FIXED_POINT
  5623. #endif
  5624. diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c
  5625. index 076359a..51b77fe 100644
  5626. --- a/libmpcodecs/ad_libmad.c
  5627. +++ b/libmpcodecs/ad_libmad.c
  5628. @@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
  5629. sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
  5630. sh->samplerate=this->frame.header.samplerate;
  5631. sh->i_bps=this->frame.header.bitrate/8;
  5632. +#ifdef WORDS_BIGENDIAN
  5633. + sh->sample_format = AF_FORMAT_S16_BE;
  5634. +#else
  5635. + sh->sample_format = AF_FORMAT_S16_LE;
  5636. +#endif
  5637. sh->samplesize=2;
  5638. return 1;
  5639. diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h
  5640. new file mode 100644
  5641. index 0000000..7ac6200
  5642. --- /dev/null
  5643. +++ b/libswscale/pico-avr32.h
  5644. @@ -0,0 +1,137 @@
  5645. +/*
  5646. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  5647. + *
  5648. + * Redistribution and use in source and binary forms, with or without
  5649. + * modification, are permitted provided that the following conditions
  5650. + * are met:
  5651. + *
  5652. + * 1. Redistributions of source code must retain the above copyright
  5653. + * notice, this list of conditions and the following disclaimer.
  5654. + *
  5655. + * 2. Redistributions in binary form must reproduce the above
  5656. + * copyright notice, this list of conditions and the following
  5657. + * disclaimer in the documentation and/or other materials provided
  5658. + * with the distribution.
  5659. + *
  5660. + * 3. The name of ATMEL may not be used to endorse or promote products
  5661. + * derived from this software without specific prior written
  5662. + * permission.
  5663. + *
  5664. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  5665. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  5666. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  5667. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  5668. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  5669. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  5670. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  5671. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  5672. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  5673. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  5674. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  5675. + * DAMAGE.
  5676. + */
  5677. +#ifndef __PICO_H__
  5678. +#define __PICO_H__
  5679. +
  5680. +/* Coprocessor Number */
  5681. +#define PICO_CPNO 1
  5682. +
  5683. +/* Pixel Coprocessor Register file */
  5684. +#define PICO_REGVECT_INPIX2 cr0
  5685. +#define PICO_REGVECT_INPIX1 cr1
  5686. +#define PICO_REGVECT_INPIX0 cr2
  5687. +#define PICO_REGVECT_OUTPIX2 cr3
  5688. +#define PICO_REGVECT_OUTPIX1 cr4
  5689. +#define PICO_REGVECT_OUTPIX0 cr5
  5690. +#define PICO_REGVECT_COEFF0_A cr6
  5691. +#define PICO_REGVECT_COEFF0_B cr7
  5692. +#define PICO_REGVECT_COEFF1_A cr8
  5693. +#define PICO_REGVECT_COEFF1_B cr9
  5694. +#define PICO_REGVECT_COEFF2_A cr10
  5695. +#define PICO_REGVECT_COEFF2_B cr11
  5696. +#define PICO_REGVECT_VMU0_OUT cr12
  5697. +#define PICO_REGVECT_VMU1_OUT cr13
  5698. +#define PICO_REGVECT_VMU2_OUT cr14
  5699. +#define PICO_REGVECT_CONFIG cr15
  5700. +
  5701. +#define PICO_INPIX2 0
  5702. +#define PICO_INPIX1 1
  5703. +#define PICO_INPIX0 2
  5704. +#define PICO_OUTPIX2 3
  5705. +#define PICO_OUTPIX1 4
  5706. +#define PICO_OUTPIX0 5
  5707. +#define PICO_COEFF0_A 6
  5708. +#define PICO_COEFF0_B 7
  5709. +#define PICO_COEFF1_A 8
  5710. +#define PICO_COEFF1_B 9
  5711. +#define PICO_COEFF2_A 10
  5712. +#define PICO_COEFF2_B 11
  5713. +#define PICO_VMU0_OUT 12
  5714. +#define PICO_VMU1_OUT 13
  5715. +#define PICO_VMU2_OUT 14
  5716. +#define PICO_CONFIG 15
  5717. +
  5718. +/* Config Register */
  5719. +#define PICO_COEFF_FRAC_BITS 0
  5720. +#define PICO_COEFF_FRAC_BITS_WIDTH 4
  5721. +#define PICO_OFFSET_FRAC_BITS 4
  5722. +#define PICO_OFFSET_FRAC_BITS_WIDTH 4
  5723. +#define PICO_INPUT_MODE 8
  5724. +#define PICO_INPUT_MODE_WIDTH 2
  5725. +#define PICO_OUTPUT_MODE 10
  5726. +
  5727. +#define PICO_TRANSFORMATION_MODE 0
  5728. +#define PICO_HOR_FILTER_MODE 1
  5729. +#define PICO_VERT_FILTER_MODE 2
  5730. +
  5731. +#define PICO_PLANAR_MODE 1
  5732. +#define PICO_PACKED_MODE 0
  5733. +
  5734. +/* Bits in coefficients */
  5735. +#define PICO_COEFF_BITS 12
  5736. +
  5737. +/* Operation bits */
  5738. +#define PICO_USE_ACC (1 << 2)
  5739. +#define PICO_SINGLE_VECTOR (1 << 3)
  5740. +
  5741. +
  5742. +#define __str(x...) #x
  5743. +#define __xstr(x...) __str(x)
  5744. +
  5745. +#define PICO_PUT_W(pico_reg, x) \
  5746. + __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
  5747. +#define PICO_GET_W(pico_reg) \
  5748. + __builtin_mvcr_w(PICO_CPNO, pico_reg)
  5749. +
  5750. +#define PICO_PUT_D(pico_reg, x) \
  5751. + __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
  5752. +#define PICO_GET_D(pico_reg) \
  5753. + __builtin_mvcr_d(PICO_CPNO, pico_reg)
  5754. +
  5755. +
  5756. +#define PICO_STCM_W(ptr, pico_regs...) \
  5757. + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5758. +#define PICO_STCM_D(ptr, pico_regs...) \
  5759. + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5760. +
  5761. +#define PICO_STCM_W_DEC(ptr, pico_regs...) \
  5762. + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
  5763. +#define PICO_STCM_D_DEC(ptr, pico_regs...) \
  5764. + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
  5765. +
  5766. +#define PICO_LDCM_W(ptr, pico_regs...) \
  5767. + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5768. +#define PICO_LDCM_D(ptr, pico_regs...) \
  5769. + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5770. +
  5771. +#define PICO_LDCM_W_INC(ptr, pico_regs...) \
  5772. + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
  5773. +#define PICO_LDCM_D_INC(ptr, pico_regs...) \
  5774. + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
  5775. +
  5776. +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
  5777. + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
  5778. +
  5779. +
  5780. +#endif
  5781. +
  5782. diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
  5783. index ecd28f5..3221d0c 100644
  5784. --- a/libswscale/swscale_internal.h
  5785. +++ b/libswscale/swscale_internal.h
  5786. @@ -173,7 +173,7 @@ typedef struct SwsContext{
  5787. SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
  5788. int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
  5789. -char *sws_format_name(int format);
  5790. +char *sws_format_name(enum PixelFormat format);
  5791. //FIXME replace this with something faster
  5792. #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \
  5793. diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
  5794. index 71759bc..fa83985 100644
  5795. --- a/libswscale/yuv2rgb.c
  5796. +++ b/libswscale/yuv2rgb.c
  5797. @@ -44,6 +44,10 @@
  5798. #include "yuv2rgb_mlib.c"
  5799. #endif
  5800. +#ifdef ARCH_AVR32
  5801. +#include "yuv2rgb_avr32.c"
  5802. +#endif
  5803. +
  5804. #define DITHER1XBPP // only for mmx
  5805. const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
  5806. @@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
  5807. if(t) return t;
  5808. }
  5809. #endif
  5810. +#ifdef ARCH_AVR32
  5811. + {
  5812. + SwsFunc t= yuv2rgb_init_avr32(c);
  5813. + if(t) return t;
  5814. + }
  5815. +#endif
  5816. #ifdef HAVE_ALTIVEC
  5817. if (c->flags & SWS_CPU_CAPS_ALTIVEC)
  5818. {
  5819. @@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
  5820. //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
  5821. oy -= 256*brightness;
  5822. +#ifdef ARCH_AVR32
  5823. + yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
  5824. +#endif
  5825. +
  5826. for (i = 0; i < 1024; i++) {
  5827. int j;
  5828. diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c
  5829. new file mode 100644
  5830. index 0000000..4a8341e
  5831. --- /dev/null
  5832. +++ b/libswscale/yuv2rgb_avr32.c
  5833. @@ -0,0 +1,416 @@
  5834. +/*
  5835. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  5836. + *
  5837. + * Redistribution and use in source and binary forms, with or without
  5838. + * modification, are permitted provided that the following conditions
  5839. + * are met:
  5840. + *
  5841. + * 1. Redistributions of source code must retain the above copyright
  5842. + * notice, this list of conditions and the following disclaimer.
  5843. + *
  5844. + * 2. Redistributions in binary form must reproduce the above
  5845. + * copyright notice, this list of conditions and the following
  5846. + * disclaimer in the documentation and/or other materials provided
  5847. + * with the distribution.
  5848. + *
  5849. + * 3. The name of ATMEL may not be used to endorse or promote products
  5850. + * derived from this software without specific prior written
  5851. + * permission.
  5852. + *
  5853. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  5854. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  5855. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  5856. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  5857. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  5858. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  5859. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  5860. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  5861. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  5862. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  5863. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  5864. + * DAMAGE.
  5865. + */
  5866. +#include "pico-avr32.h"
  5867. +
  5868. +
  5869. +#define RGB(uv_part) \
  5870. + __asm__ volatile ( \
  5871. + "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \
  5872. + "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \
  5873. + "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \
  5874. + "add\t%1, %0\n\t" /* g += tmp */\
  5875. + "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \
  5876. + : "=&r" (r), "=&r" (g), "=&r" (b) \
  5877. + : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
  5878. + "r" (&c->table_rV[0]), "r" (V), "r" (U));
  5879. +
  5880. +
  5881. +#undef YUV2RGB1
  5882. +#define YUV2RGB1(dst, src, y, idx) \
  5883. + { int tmp2; __asm__ volatile ( \
  5884. + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5885. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
  5886. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
  5887. + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
  5888. + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
  5889. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
  5890. + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5891. + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
  5892. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
  5893. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
  5894. + "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
  5895. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
  5896. + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
  5897. + "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
  5898. + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
  5899. + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
  5900. +
  5901. +#undef YUV2RGB2
  5902. +#define YUV2RGB2(dst, src, y, idx) \
  5903. + { int tmp2; __asm__ volatile ( \
  5904. + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5905. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
  5906. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
  5907. + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
  5908. + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
  5909. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
  5910. + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5911. + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
  5912. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
  5913. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
  5914. + "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
  5915. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
  5916. + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
  5917. + "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
  5918. + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
  5919. + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
  5920. +
  5921. +
  5922. +#undef YUV2BGR1
  5923. +#define YUV2BGR1(dst, src, y, idx) \
  5924. + { int tmp2; __asm__ volatile ( \
  5925. + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5926. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
  5927. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
  5928. + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
  5929. + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
  5930. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
  5931. + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5932. + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
  5933. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
  5934. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
  5935. + "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
  5936. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
  5937. + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
  5938. + "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
  5939. + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
  5940. + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
  5941. +
  5942. +#undef YUV2BGR2
  5943. +#define YUV2BGR2(dst, src, y, idx) \
  5944. + { int tmp2; __asm__ volatile ( \
  5945. + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5946. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
  5947. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
  5948. + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
  5949. + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
  5950. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
  5951. + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
  5952. + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
  5953. + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
  5954. + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
  5955. + "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
  5956. + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
  5957. + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
  5958. + "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
  5959. + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
  5960. + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
  5961. +
  5962. +
  5963. +
  5964. +int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  5965. + int srcSliceH, uint8_t* dst[], int dstStride[]){
  5966. + int y;
  5967. +
  5968. + if(c->srcFormat == PIX_FMT_YUV422P){
  5969. + srcStride[1] *= 2;
  5970. + srcStride[2] *= 2;
  5971. + }
  5972. +
  5973. +
  5974. + for(y=0; y<srcSliceH; y+=2){
  5975. + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
  5976. + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
  5977. + uint32_t *r, *g, *b;
  5978. + uint8_t *py_1= src[0] + y*srcStride[0];
  5979. + uint8_t *py_2= py_1 + srcStride[0];
  5980. + uint8_t *pu= src[1] + (y>>1)*srcStride[1];
  5981. + uint8_t *pv= src[2] + (y>>1)*srcStride[2];
  5982. + unsigned int h_size= c->dstW>>3;
  5983. + while (h_size--) {
  5984. + uint32_t U, V, Y1, Y2, tmp;
  5985. + U = ((uint32_t*)pu)[0];
  5986. + V = ((uint32_t*)pv)[0];
  5987. +
  5988. + RGB("t")
  5989. + YUV2BGR1(dst_1, py_1, Y1, 0)
  5990. + YUV2BGR1(dst_2, py_2, Y2, 0)
  5991. +
  5992. + RGB("u")
  5993. + YUV2BGR2(dst_1, py_1, Y1, 1)
  5994. + YUV2BGR2(dst_2, py_2, Y2, 1)
  5995. +
  5996. + RGB("l")
  5997. + YUV2BGR1(dst_1, py_1, Y1, 2)
  5998. + YUV2BGR1(dst_2, py_2, Y2, 2)
  5999. +
  6000. + RGB("b")
  6001. + YUV2BGR2(dst_1, py_1, Y1, 3)
  6002. + YUV2BGR2(dst_2, py_2, Y2, 3)
  6003. +
  6004. +
  6005. +
  6006. + pu += 4;
  6007. + pv += 4;
  6008. + py_1 += 8;
  6009. + py_2 += 8;
  6010. + dst_1 += 24;
  6011. + dst_2 += 24;
  6012. + }
  6013. + }
  6014. + return srcSliceH;
  6015. +}
  6016. +
  6017. +
  6018. +
  6019. +static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  6020. + int srcSliceH, uint8_t* dst[], int dstStride[]){
  6021. + int y;
  6022. +
  6023. + if(c->srcFormat == PIX_FMT_YUV422P){
  6024. + srcStride[1] *= 2;
  6025. + srcStride[2] *= 2;
  6026. + }
  6027. + for(y=0; y<srcSliceH; y+=2){
  6028. + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
  6029. + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
  6030. + uint8_t *r, *g, *b;
  6031. + uint8_t *py_1= src[0] + y*srcStride[0];
  6032. + uint8_t *py_2= py_1 + srcStride[0];
  6033. + uint8_t *pu= src[1] + (y>>1)*srcStride[1];
  6034. + uint8_t *pv= src[2] + (y>>1)*srcStride[2];
  6035. + unsigned int h_size= c->dstW>>3;
  6036. + while (h_size--) {
  6037. + uint32_t U, V, Y1, Y2, tmp;
  6038. + U = ((uint32_t*)pu)[0];
  6039. + V = ((uint32_t*)pv)[0];
  6040. +
  6041. + RGB("t")
  6042. + YUV2RGB1(dst_1, py_1, Y1, 0)
  6043. + YUV2RGB1(dst_2, py_2, Y2, 0)
  6044. +
  6045. + RGB("u")
  6046. + YUV2RGB2(dst_1, py_1, Y1, 1)
  6047. + YUV2RGB2(dst_2, py_2, Y2, 1)
  6048. +
  6049. + RGB("l")
  6050. + YUV2RGB1(dst_1, py_1, Y1, 2)
  6051. + YUV2RGB1(dst_2, py_2, Y2, 2)
  6052. +
  6053. + RGB("b")
  6054. + YUV2RGB2(dst_1, py_1, Y1, 3)
  6055. + YUV2RGB2(dst_2, py_2, Y2, 3)
  6056. +
  6057. + pu += 4;
  6058. + pv += 4;
  6059. + py_1 += 8;
  6060. + py_2 += 8;
  6061. + dst_1 += 24;
  6062. + dst_2 += 24;
  6063. + }
  6064. + }
  6065. + return srcSliceH;
  6066. +}
  6067. +
  6068. +#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits)
  6069. +#define COEFF_FRAC_BITS 9
  6070. +#define OFFSET_FRAC_BITS 2
  6071. +
  6072. +/* Coefficients used in the pico */
  6073. +static struct {
  6074. + short coeff2_2;
  6075. + short coeff2_3;
  6076. + short coeff2_0;
  6077. + short coeff2_1;
  6078. + short coeff1_2;
  6079. + short coeff1_3;
  6080. + short coeff1_0;
  6081. + short coeff1_1;
  6082. + short coeff0_2;
  6083. + short coeff0_3;
  6084. + short coeff0_0;
  6085. + short coeff0_1;
  6086. +} pico_coeff;
  6087. +
  6088. +
  6089. +static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
  6090. + int srcSliceH, uint8_t* dst[], int dstStride[]){
  6091. + int y;
  6092. + static int first_time = 1;
  6093. +
  6094. + /* Initialize pico */
  6095. + PICO_LDCM_D(&pico_coeff,
  6096. + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
  6097. + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
  6098. + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
  6099. +
  6100. + PICO_PUT_W(PICO_CONFIG,
  6101. + (PICO_PACKED_MODE << PICO_OUTPUT_MODE
  6102. + | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
  6103. + | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
  6104. + | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
  6105. +
  6106. +
  6107. + if(c->srcFormat == PIX_FMT_YUV422P){
  6108. + srcStride[1] *= 2;
  6109. + srcStride[2] *= 2;
  6110. + }
  6111. +
  6112. + for(y=0; y<srcSliceH; y+=2){
  6113. + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
  6114. + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
  6115. + uint8_t *r, *g, *b;
  6116. + uint8_t *py_1= src[0] + y*srcStride[0];
  6117. + uint8_t *py_2= py_1 + srcStride[0];
  6118. + uint8_t *pu= src[1] + (y>>1)*srcStride[1];
  6119. + uint8_t *pv= src[2] + (y>>1)*srcStride[2];
  6120. + unsigned int h_size= c->dstW>>3;
  6121. + int *py_1_int = (int *)py_1;
  6122. + int *py_2_int = (int *)py_2;
  6123. + int *pu_int = (int *)pu;
  6124. + int *pv_int = (int *)pv;
  6125. + while (h_size--) {
  6126. + PICO_PUT_W(PICO_INPIX0, *py_1_int++);
  6127. + PICO_PUT_W(PICO_INPIX1, *pu_int++);
  6128. + PICO_PUT_W(PICO_INPIX2, *pv_int++);
  6129. + PICO_OP(0, 0, 0, 4, 8);
  6130. + PICO_OP(0, 1, 1, 4, 8);
  6131. + PICO_OP(0, 2, 2, 5, 9);
  6132. + PICO_OP(0, 3, 3, 5, 9);
  6133. + PICO_PUT_W(PICO_INPIX0, *py_1_int++);
  6134. + PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
  6135. + PICO_OP(0, 0, 0, 6, 10);
  6136. + PICO_OP(0, 1, 1, 6, 10);
  6137. + PICO_OP(0, 2, 2, 7, 11);
  6138. + PICO_OP(0, 3, 3, 7, 11);
  6139. + PICO_PUT_W(PICO_INPIX0, *py_2_int++);
  6140. + PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
  6141. +
  6142. + PICO_OP(0, 0, 0, 4, 8);
  6143. + PICO_OP(0, 1, 1, 4, 8);
  6144. + PICO_OP(0, 2, 2, 5, 9);
  6145. + PICO_OP(0, 3, 3, 5, 9);
  6146. + PICO_PUT_W(PICO_INPIX0, *py_2_int++);
  6147. + PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
  6148. + PICO_OP(0, 0, 0, 6, 10);
  6149. + PICO_OP(0, 1, 1, 6, 10);
  6150. + PICO_OP(0, 2, 2, 7, 11);
  6151. + PICO_OP(0, 3, 3, 7, 11);
  6152. + PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
  6153. +
  6154. + dst_1 += 24;
  6155. + dst_2 += 24;
  6156. + }
  6157. + }
  6158. + return srcSliceH;
  6159. +}
  6160. +
  6161. +extern int avr32_use_pico;
  6162. +
  6163. +SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
  6164. + switch(c->dstFormat){
  6165. + case PIX_FMT_BGR24:
  6166. + {
  6167. + if ( avr32_use_pico ){
  6168. + MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n");
  6169. + return yuv2bgr24_avr32_pico;
  6170. + } else {
  6171. + MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n");
  6172. + return yuv2bgr24_avr32;
  6173. + }
  6174. + }
  6175. + break;
  6176. + case PIX_FMT_RGB24:
  6177. + {
  6178. + if ( avr32_use_pico ){
  6179. + MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n");
  6180. + return yuv2bgr24_avr32_pico;
  6181. + } else {
  6182. + MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n");
  6183. + return yuv2rgb24_avr32;
  6184. + }
  6185. + }
  6186. + }
  6187. + return NULL;
  6188. +}
  6189. +
  6190. +
  6191. +int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
  6192. + const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
  6193. +
  6194. + int64_t crv = inv_table[0];
  6195. + int64_t cbu = inv_table[1];
  6196. + int64_t cgu = -inv_table[2];
  6197. + int64_t cgv = -inv_table[3];
  6198. + int64_t cy = 1<<16;
  6199. + int64_t oy = 0;
  6200. +
  6201. + if(!fullRange){
  6202. + cy= (cy*255) / 219;
  6203. + oy= 16<<16;
  6204. + }
  6205. +
  6206. + cy = (cy *contrast )>>16;
  6207. + crv= (crv*contrast * saturation)>>32;
  6208. + cbu= (cbu*contrast * saturation)>>32;
  6209. + cgu= (cgu*contrast * saturation)>>32;
  6210. + cgv= (cgv*contrast * saturation)>>32;
  6211. +
  6212. + oy -= 256*brightness;
  6213. +
  6214. + pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */
  6215. + pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
  6216. + pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */
  6217. + pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
  6218. + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
  6219. +
  6220. + if ( isRgb ){
  6221. + pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
  6222. + pico_coeff.coeff0_1 = 0; /* R <- U */
  6223. + pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
  6224. + pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
  6225. + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
  6226. +
  6227. + pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
  6228. + pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
  6229. + pico_coeff.coeff2_2 = 0; /* B <- V */
  6230. + pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
  6231. + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
  6232. + } else {
  6233. + pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
  6234. + pico_coeff.coeff2_1 = 0; /* R <- U */
  6235. + pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
  6236. + pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
  6237. + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
  6238. +
  6239. + pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
  6240. + pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
  6241. + pico_coeff.coeff0_2 = 0; /* B <- V */
  6242. + pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
  6243. + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
  6244. + }
  6245. +
  6246. +}
  6247. +
  6248. +
  6249. +#undef RGB
  6250. diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c
  6251. index 053c193..7017770 100644
  6252. --- a/libvo/vo_fbdev2.c
  6253. +++ b/libvo/vo_fbdev2.c
  6254. @@ -22,6 +22,9 @@
  6255. #include "sub.h"
  6256. #include "mp_msg.h"
  6257. +/* Draw directly to framebuffer */
  6258. +#define USE_CONVERT2FB
  6259. +
  6260. static vo_info_t info = {
  6261. "Framebuffer Device",
  6262. "fbdev2",
  6263. @@ -178,6 +181,15 @@ static int fb_preinit(int reset)
  6264. }
  6265. fb_orig_vinfo = fb_vinfo;
  6266. + /* Reset panning offset */
  6267. + fb_vinfo.yoffset = 0;
  6268. + if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
  6269. + mp_msg(MSGT_VO, MSGL_ERR,
  6270. + "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
  6271. + strerror(errno));
  6272. + return 0;
  6273. + }
  6274. +
  6275. fb_bpp = fb_vinfo.bits_per_pixel;
  6276. /* 16 and 15 bpp is reported as 16 bpp */
  6277. @@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width,
  6278. mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
  6279. return 1;
  6280. }
  6281. +#else
  6282. + if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
  6283. + && fb_vinfo.yoffset == 0)
  6284. + center += fb_line_len * fb_vinfo.yres;
  6285. #endif
  6286. if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
  6287. @@ -299,14 +315,22 @@ static int query_format(uint32_t format)
  6288. {
  6289. // open the device, etc.
  6290. if (fb_preinit(0)) return 0;
  6291. - if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
  6292. + if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
  6293. int fb_target_bpp = format & 0xff;
  6294. set_bpp(&fb_vinfo, fb_target_bpp);
  6295. fb_vinfo.xres_virtual = fb_vinfo.xres;
  6296. - fb_vinfo.yres_virtual = fb_vinfo.yres;
  6297. + fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
  6298. if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
  6299. - mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
  6300. - return 0;
  6301. + mp_msg(MSGT_VO, MSGL_WARN,
  6302. + "[fbdev2] Can't double virtual y resolution: %s\n",
  6303. + strerror(errno));
  6304. + fb_vinfo.yres_virtual = fb_vinfo.yres;
  6305. + if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
  6306. + mp_msg(MSGT_VO, MSGL_ERR,
  6307. + "[fbdev2] Can't put VSCREENINFO: %s\n",
  6308. + strerror(errno));
  6309. + return -1;
  6310. + }
  6311. }
  6312. fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
  6313. fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
  6314. @@ -367,16 +391,67 @@ static void check_events(void)
  6315. static void flip_page(void)
  6316. {
  6317. -#ifndef USE_CONVERT2FB
  6318. int i, out_offset = 0, in_offset = 0;
  6319. - for (i = 0; i < in_height; i++) {
  6320. - memcpy(center + out_offset, next_frame + in_offset,
  6321. - in_width * fb_pixel_size);
  6322. - out_offset += fb_line_len;
  6323. - in_offset += in_width * fb_pixel_size;
  6324. - }
  6325. +#ifndef USE_CONVERT2FB
  6326. + if (1) {
  6327. +#else
  6328. + if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
  6329. #endif
  6330. + for (i = 0; i < in_height; i++) {
  6331. + memcpy(center + out_offset, next_frame + in_offset,
  6332. + in_width * fb_pixel_size);
  6333. + out_offset += fb_line_len;
  6334. + in_offset += in_width * fb_pixel_size;
  6335. + }
  6336. + } else {
  6337. + if (fb_vinfo.yoffset == 0) {
  6338. + fb_vinfo.yoffset += fb_vinfo.yres;
  6339. + center -= fb_line_len * fb_vinfo.yres;
  6340. + } else {
  6341. + fb_vinfo.yoffset = 0;
  6342. + center += fb_line_len * fb_vinfo.yres;
  6343. + }
  6344. +
  6345. + if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
  6346. + mp_msg(MSGT_VO, MSGL_ERR,
  6347. + "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
  6348. + strerror(errno));
  6349. + }
  6350. + }
  6351. +}
  6352. +
  6353. +static uint32_t get_image(mp_image_t *mpi)
  6354. +{
  6355. + if(mpi->flags&MP_IMGFLAG_READABLE)
  6356. + return VO_FALSE; // slow video ram
  6357. + if(mpi->type==MP_IMGTYPE_STATIC)
  6358. + return VO_FALSE; // it is not static
  6359. +
  6360. + if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
  6361. + // we're lucky or codec accepts stride => ok, let's go!
  6362. +
  6363. + //YUY2 and RGB formats
  6364. + mpi->planes[0] = center;
  6365. + mpi->width = in_width;
  6366. + mpi->stride[0] = fb_line_len;
  6367. +
  6368. + // center image
  6369. +
  6370. + mpi->flags |= MP_IMGFLAG_DIRECT;
  6371. +
  6372. + return VO_TRUE;
  6373. + }
  6374. +
  6375. + return VO_FALSE;
  6376. +}
  6377. +
  6378. +static uint32_t put_image(mp_image_t *mpi)
  6379. +{
  6380. + // already out?
  6381. + if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK)))
  6382. + return VO_TRUE;
  6383. + return VO_FALSE;
  6384. }
  6385. static void uninit(void)
  6386. @@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...)
  6387. switch (request) {
  6388. case VOCTRL_QUERY_FORMAT:
  6389. return query_format(*((uint32_t*)data));
  6390. + case VOCTRL_GET_IMAGE:
  6391. + return get_image(data);
  6392. + case VOCTRL_DRAW_IMAGE:
  6393. + return put_image(data);
  6394. }
  6395. return VO_NOTIMPL;
  6396. }
  6397. diff --git a/version.sh b/version.sh
  6398. index 44b5c5d..cf22a68 100755
  6399. --- a/version.sh
  6400. +++ b/version.sh
  6401. @@ -1,2 +1,2 @@
  6402. #!/bin/sh
  6403. -echo "#define VERSION \"1.0rc1-$1\"" > version.h
  6404. +echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h