mplayer-1.0rc1-atmel.3.patch

 cfg-common.h                     |    4 +
 cfg-mencoder.h                   |    4 +
 cfg-mplayer.h                    |    4 +
 configure                        |   13 +-
 libaf/af_format.c                |    7 +
 libavcodec/Makefile              |    7 +
 libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
 libavcodec/avr32/fdct.S          |  541 ++++++++
 libavcodec/avr32/h264idct.S      |  451 +++++++
 libavcodec/avr32/idct.S          |  829 ++++++++++++
 libavcodec/avr32/mc.S            |  434 ++++++
 libavcodec/avr32/pico.h          |  260 ++++
 libavcodec/bitstream.h           |   77 +-
 libavcodec/dsputil.c             |    3 +
 libavcodec/h264.c                |   15 +
 libavutil/common.h               |   16 +
 libavutil/internal.h             |    9 +
 libfaad2/common.h                |    2 +-
 libmpcodecs/ad_libmad.c          |    5 +
 libswscale/pico-avr32.h          |  137 ++
 libswscale/swscale_internal.h    |    2 +-
 libswscale/yuv2rgb.c             |   14 +
 libswscale/yuv2rgb_avr32.c       |  416 ++++++
 libvo/vo_fbdev2.c                |  101 ++-
 version.sh                       |    2 +-
 25 files changed, 6011 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/avr32/dsputil_avr32.c
 create mode 100644 libavcodec/avr32/fdct.S
 create mode 100644 libavcodec/avr32/h264idct.S
 create mode 100644 libavcodec/avr32/idct.S
 create mode 100644 libavcodec/avr32/mc.S
 create mode 100644 libavcodec/avr32/pico.h
 create mode 100644 libswscale/pico-avr32.h
 create mode 100644 libswscale/yuv2rgb_avr32.c
--- a/cfg-common.h
+++ b/cfg-common.h
@@ -235,6 +235,10 @@
 	{"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
 	{"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
 
+#ifdef ARCH_AVR32
+	{"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+	{"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+#endif
 	// draw by slices or whole frame (useful with libmpeg2/libavcodec)
 	{"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
 	{"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
--- a/cfg-mencoder.h
+++ b/cfg-mencoder.h
@@ -5,6 +5,10 @@
 #include "cfg-common.h"
 
+#ifdef ARCH_AVR32
+extern int avr32_use_pico;
+#endif
+
 #ifdef USE_FAKE_MONO
 extern int fakemono; // defined in dec_audio.c
 #endif
--- a/cfg-mplayer.h
+++ b/cfg-mplayer.h
@@ -4,6 +4,10 @@
 #include "cfg-common.h"
 
+#ifdef ARCH_AVR32
+extern int avr32_use_pico;
+#endif
+
 extern int noconsolecontrols;
 
 #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
--- a/configure
+++ b/configure
@@ -1203,6 +1203,15 @@ EOF
     _optimizing="$proc"
     ;;
 
+  avr32)
+    _def_arch='#define ARCH_AVR32'
+    _target_arch='TARGET_ARCH_AVR32 = yes'
+    iproc='avr32'
+    proc=''
+    _march=''
+    _mcpu=''
+    _optimizing=''
+    ;;
+
   arm|armv4l|armv5tel)
     _def_arch='#define ARCH_ARMV4L 1'
     _target_arch='TARGET_ARCH_ARMV4L = yes'
@@ -1533,7 +1542,7 @@ echores $_named_asm_args
 # Checking for CFLAGS
 _stripbinaries=yes
 if test "$_profile" != "" || test "$_debug" != "" ; then
-  CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
+  CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
   if test "$_cc_major" -ge "3" ; then
     CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
   fi
@@ -3794,7 +3803,7 @@ fi
 echocheck "X11 headers presence"
-for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
+for I in `echo $_inc_extra | sed s/-I//g`; do
   if test -f "$I/X11/Xlib.h" ; then
     _inc_x11="-I$I"
     _x11_headers="yes"
--- a/libaf/af_format.c
+++ b/libaf/af_format.c
@@ -20,7 +20,14 @@
 // Integer to float conversion through lrintf()
 #ifdef HAVE_LRINTF
 #include <math.h>
+
+#ifdef ARCH_AVR32
+#define lrintf(x) rint(x)
+#define llrint(x) (long long)rint(x)
+#else
 long int lrintf(float);
+#endif
+
 #else
 #define lrintf(x) ((int)(x))
 #endif
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) +
 sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
 
+# avr32 specific stuff
+ifeq ($(TARGET_ARCH_AVR32),yes)
+ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
+OBJS += avr32/dsputil_avr32.o
+endif
+
 # sun mediaLib specific stuff
 OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \
 
@@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
 clean::
 	rm -f \
 	    i386/*.o i386/*~ \
+	    avr32/*.o avr32/*~ \
 	    armv4l/*.o armv4l/*~ \
 	    mlib/*.o mlib/*~ \
 	    alpha/*.o alpha/*~ \
--- /dev/null
+++ b/libavcodec/avr32/dsputil_avr32.c
@@ -0,0 +1,2678 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials provided
+ *    with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ *    derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "../dsputil.h"
+#include "pico.h"
+
+int avr32_use_pico = 1;
+
+//#define CHECK_DSP_FUNCS_AGAINST_C
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define DSP_FUNC_NAME(name) test_ ## name
+#else
+#define DSP_FUNC_NAME(name) name
+#endif
+
+union doubleword {
+    int64_t doubleword;
+    struct {
+        int32_t top;
+        int32_t bottom;
+    } words;
+};
+
+#undef LD16
+#undef LD32
+#undef LD64
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+#define LD64_UNALIGNED(a) \
+    ({ union doubleword __tmp__; \
+       __tmp__.words.top = LD32(a); \
+       __tmp__.words.bottom = LD32(a + 4); \
+       __tmp__.doubleword; })
+
+#undef ST32
+#undef ST16
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+
+#undef rnd_avg32
+#define rnd_avg32(a, b) \
+    ({ uint32_t __tmp__; \
+       asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b)); \
+       __tmp__; })
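+
+/* rnd_avg32 maps onto the AVR32 SIMD instruction pavg.ub, which averages
+ * the four unsigned bytes of two 32-bit words with rounding, so the generic
+ * C bit-trick version is replaced by a single instruction here. */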
+
+void idct_avr32(DCTELEM *data);
+void fdct_avr32(DCTELEM *data);
+
+void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+
+void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+
+#define extern_dspfunc(PFX, NUM) \
+    void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h)
+
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+#undef extern_dspfunc
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define extern_dspfunc(PFX, NUM) \
+    void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h); \
+    void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h)
+
+extern_dspfunc(put, 4);
+extern_dspfunc(put_no_rnd, 4);
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(put, 16);
+extern_dspfunc(put_no_rnd, 16);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+extern_dspfunc(avg, 16);
+extern_dspfunc(avg_no_rnd, 16);
+
+#undef extern_dspfunc
+#define extern_dspfunc(PFX, NUM) \
+void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride);
+
+extern_dspfunc(put_h264_qpel, 16);
+extern_dspfunc(put_h264_qpel, 8);
+extern_dspfunc(put_h264_qpel, 4);
+extern_dspfunc(avg_h264_qpel, 16);
+extern_dspfunc(avg_h264_qpel, 8);
+extern_dspfunc(avg_h264_qpel, 4);
+
+#undef extern_dspfunc
+
+void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+void dump_block8(uint8_t *block, int line_size, int h);
+void dump_block4(uint8_t *block, int line_size, int h);
+void dump_block(uint8_t *block, int line_size, int h, int w);
+
+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+                  int h, char *name, int max_dev);
+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+                  int h, char *name, int max_dev);
+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+                 int h, int width, char *name, int max_dev);
+
+#define PIXOP2( OPNAME, OP ) \
+void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block)), LD32(pixels));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                           int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1  ]);\
+        b= LD32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+    }\
+}\
+\
+void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                           int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1  ]);\
+        b= LD32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+    }\
+}\
+\
+void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                            int src_stride1, int src_stride2, int h){\
+    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#else
+#define PIXOP2( OPNAME, OP ) \
+static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block)), LD32(pixels));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
+        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block   )), LD32(pixels   ));\
+        OP(*((uint32_t*)(block+ 4)), LD32(pixels+ 4));\
+        OP(*((uint32_t*)(block+ 8)), LD32(pixels+ 8));\
+        OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                  int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1  ]);\
+        b= LD32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                  int src_stride1, int src_stride2, int h){\
+    int i;\
+    for(i=0; i<h; i++){\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1  ]);\
+        b= LD32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
+    }\
+}\
+\
+static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                   int src_stride1, int src_stride2, int h){\
+    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#endif
+
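+/* op_put stores the word as-is; op_avg rounds-averages it with the bytes
+ * already in the destination. Instantiating PIXOP2 with each generates the
+ * put_/avg_ pixel-copy and two-source-average primitives used below. */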
+#define op_avg(a, b) a = rnd_avg32(a, b)
+#define op_put(a, b) a = b
+
+PIXOP2(avg, op_avg)
+PIXOP2(put, op_put)
+#undef op_avg
+#undef op_put
+
+
+static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        ST32(dst, LD32(src));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
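+/* Zero all six 8x8 blocks of one macroblock: 6*64 16-bit coefficients =
+ * 768 bytes (assuming the usual 16-bit DCTELEM). Each "stm --%3"
+ * pre-decrements the pointer and stores four 32-bit registers (16 bytes),
+ * so four stm per iteration times 12 iterations covers exactly 768 bytes. */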
+static void clear_blocks_avr32(DCTELEM *blocks)
+{
+    int n = 12;
+    uint64_t tmp1, tmp2;
+    blocks += 6*64;
+    asm volatile ( "mov\t%1, 0\n"
+                   "mov\t%m1, 0\n"
+                   "mov\t%2, 0\n"
+                   "mov\t%m2, 0\n"
+                   "0:\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "stm\t--%3, %1, %m1, %2, %m2\n"
+                   "sub\t%0, 1\n"
+                   "brne\t0b\n"
+                   : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
+                     "+r"(blocks));
+}
+
+
+static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        ST32(dst  , LD32(src  ));
+        ST32(dst+4, LD32(src+4));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+    int i;
+    for(i=0; i<h; i++)
+    {
+        ST32(dst   , LD32(src   ));
+        ST32(dst+ 4, LD32(src+ 4));
+        ST32(dst+ 8, LD32(src+ 8));
+        ST32(dst+12, LD32(src+12));
+        dst+=dstStride;
+        src+=srcStride;
+    }
+}
+
+
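+/* H.264 chroma motion compensation:
+ *   dst[i] = (A*src[i] + B*src[i+1] + C*src[stride+i] + D*src[stride+i+1] + 32) >> 6
+ * (see the reference C code quoted in the comments below). The PICO
+ * coprocessor does the arithmetic: A..D go into the coefficient registers,
+ * COEFF0_B holds the +32 rounding offset, and PICO_COEFF_FRAC_BITS(6)
+ * provides the final >> 6. */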
+static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        int src0 = LD32(src);
+        int src1 = LD32(src + stride);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+        src += stride;
+        ST16(dst, (short)PICO_GET_W(PICO_OUTPIX0));
+        dst += stride;
+    }
+}
+
+
+static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+          dst+= stride;
+          src+= stride;
+        */
+
+        int src0 = LD32(src);
+        int src1 = (((int)src[4] << 24) | (int)src[stride]);
+        int src2 = LD32(src + stride + 1);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+        dst += stride;
+    }
+}
+
+static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+          OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+          OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+          OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+          OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+          dst+= stride;
+          src+= stride;
+        */
+        int src0 = LD32(src);
+        int src1 = (((int)src[4] << 24) | (int)src[stride]);
+        int src2 = LD32(src + stride + 1);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+        src0 = LD32(src + 4);
+        src1 = (src[8] << 24) | src[stride + 4];
+        src2 = LD32(src + stride + 5);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
+
+        dst += stride;
+    }
+}
+
+
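+/* The avg_ chroma variants below compute the same interpolation as the
+ * put_ versions above but blend the PICO result with the bytes already in
+ * dst via rnd_avg32 (pavg.ub). */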
+static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        int src0 = LD32(src);
+        int src1 = LD32(src + stride);
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+        src += stride;
+        ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
+        dst += stride;
+    }
+}
+
+
+static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+          dst+= stride;
+          src+= stride;
+        */
+
+        int src0 = *((int *)src);
+        int src1 = (int)((src[4] << 24) | src[stride]);
+        int src2 = *((int *)(src + stride + 1));
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+        dst += stride;
+    }
+}
+
+static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+    const int A=(8-x)*(8-y);
+    const int B=(  x)*(8-y);
+    const int C=(8-x)*(  y);
+    const int D=(  x)*(  y);
+    int i;
+
+    PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF0_B, 32);
+    PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+    PICO_PUT_W(PICO_COEFF1_B, 0);
+    PICO_PUT_W(PICO_COEFF2_A, 0);
+    PICO_PUT_W(PICO_COEFF2_B, 0);
+    PICO_PUT_W(PICO_CONFIG,
+               PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+               | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+               | PICO_COEFF_FRAC_BITS(6)
+               | PICO_OFFSET_FRAC_BITS(6));
+
+    for(i=0; i<h; i++)
+    {
+        /*
+          OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+          OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+          OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+          OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+          OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+          OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+          OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+          OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+          dst+= stride;
+          src+= stride;
+        */
+        int src0 = *((int *)src);
+        int src1 = (int)((src[4] << 24) | src[stride]);
+        int src2 = *((int *)(src + stride + 1));
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+
+        src0 = *((int *)(src + 4));
+        src1 = (int)((src[8] << 24) | src[stride + 4]);
+        src2 = *((int *)(src + stride + 5));
+
+        PICO_MVRC_W(PICO_INPIX0, src0);
+        PICO_MVRC_W(PICO_INPIX1, src1);
+        PICO_MVRC_W(PICO_INPIX2, src2);
+        PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+        PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+        src += stride;
+        ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
+        dst += stride;
+    }
+}
+
+static struct pico_config_t h264_qpel4_h_lowpass_config = {
+    .input_mode = PICO_HOR_FILTER_MODE,
+    .output_mode = PICO_PLANAR_MODE,
+    .coeff_frac_bits = 5,
+    .offset_frac_bits = 5,
+    .coeff0_0 = 1,
+    .coeff0_1 = -5,
+    .coeff0_2 = 20,
+    .coeff0_3 = 16,
+    .coeff1_0 = 20,
+    .coeff1_1 = -5,
+    .coeff1_2 = 1,
+    .coeff1_3 = 0,
+    .coeff2_0 = 0,
+    .coeff2_1 = 0,
+    .coeff2_2 = 0,
+    .coeff2_3 = 0
+};
+
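+/* The config above implements the H.264 6-tap half-pel filter
+ * (1, -5, 20, 20, -5, 1): coeff0/coeff1 hold the two halves of the kernel,
+ * the coeffN_3 fields appear to act as per-vector accumulator offsets, and
+ * 5 fractional bits give the standard (sum + 16) >> 5 rounding. */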
  795. +static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  796. + const int h=4;
  797. + int i;
  798. +
  799. + set_pico_config(&h264_qpel4_h_lowpass_config);
  800. +
  801. + for(i=0; i<h; i++){
  802. +
  803. + /*
  804. + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
  805. + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
  806. + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
  807. + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
  808. + dst+=dstStride;\
  809. + src+=srcStride;\ */
  810. + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
  811. + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
  812. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  813. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  814. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  815. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  816. + src += srcStride;
  817. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  818. + dst += dstStride;
  819. + }
  820. +}
  821. +
  822. +static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  823. + const int h=4;
  824. + int i;
  825. +
  826. + set_pico_config(&h264_qpel4_h_lowpass_config);
  827. +
  828. + for(i=0; i<h; i++){
  829. +
  830. + /*
  831. + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
  832. + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
  833. + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
  834. + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
  835. + dst+=dstStride;\
  836. + src+=srcStride;\ */
  837. +
  838. + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
  839. + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
  840. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  841. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  842. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  843. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  844. + src += srcStride;
  845. + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
  846. + dst += dstStride;
  847. + }
  848. +}
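Both avg_ variants above rely on rnd_avg32() from FFmpeg's dsputil, which averages four byte lanes at once with rounding up; a minimal sketch of its usual definition:

    /* Per-byte (a + b + 1) >> 1 without carries crossing lane boundaries:
       (a | b) minus half of the non-carry difference bits. */
    static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a | b) - (((a ^ b) & 0xFEFEFEFEU) >> 1);
    }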
  849. +
  850. +static struct pico_config_t h264_qpel4_v_lowpass_config1 = {
  851. + .input_mode = PICO_VERT_FILTER_MODE,
  852. + .output_mode = PICO_PACKED_MODE,
  853. + .coeff_frac_bits = 5,
  854. + .offset_frac_bits = 5,
  855. + .coeff0_0 = 1,
  856. + .coeff0_1 = -5,
  857. + .coeff0_2 = 20,
  858. + .coeff0_3 = 16,
  859. + .coeff1_0 = 1,
  860. + .coeff1_1 = -5,
  861. + .coeff1_2 = 20,
  862. + .coeff1_3 = 16,
  863. + .coeff2_0 = 1,
  864. + .coeff2_1 = -5,
  865. + .coeff2_2 = 20,
  866. + .coeff2_3 = 16
  867. +};
  868. +
  869. +
  870. +
  871. +static struct pico_config_t h264_qpel4_v_lowpass_config2 = {
  872. + .input_mode = PICO_VERT_FILTER_MODE,
  873. + .output_mode = PICO_PLANAR_MODE,
  874. + .coeff_frac_bits = 5,
  875. + .offset_frac_bits = 5,
  876. + .coeff0_0 = 1,
  877. + .coeff0_1 = -5,
  878. + .coeff0_2 = 20,
  879. + .coeff0_3 = 16,
  880. + .coeff1_0 = 20,
  881. + .coeff1_1 = -5,
  882. + .coeff1_2 = 1,
  883. + .coeff1_3 = 0,
  884. + .coeff2_0 = 0,
  885. + .coeff2_1 = 0,
  886. + .coeff2_2 = 0,
  887. + .coeff2_3 = 0
  888. +};
  889. +
  890. +static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  891. +
  892. + /*
  893. + const int w=4;
  894. + uint8_t *cm = cropTbl + MAX_NEG_CROP;
  895. + int i;
  896. + for(i=0; i<w; i++)
  897. + {
  898. + const int srcB= src[-2*srcStride];\
  899. + const int srcA= src[-1*srcStride];\
  900. + const int src0= src[0 *srcStride];\
  901. + const int src1= src[1 *srcStride];\
  902. + const int src2= src[2 *srcStride];\
  903. + const int src3= src[3 *srcStride];\
  904. + const int src4= src[4 *srcStride];\
  905. + const int src5= src[5 *srcStride];\
  906. + const int src6= src[6 *srcStride];\
  907. + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  908. + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  909. + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  910. + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  911. + dst++;\
  912. + src++;\
  913. + */
  914. +
  915. + set_pico_config(&h264_qpel4_v_lowpass_config1);
  916. +
  917. + {
  918. + int srcB= LD32(src - 2*srcStride);
  919. + int srcA= LD32(src - 1*srcStride);
  920. + int src0= LD32(src + 0 *srcStride);
  921. + int src1= LD32(src + 1 *srcStride);
  922. + int src2= LD32(src + 2 *srcStride);
  923. + int src3= LD32(src + 3 *srcStride);
  924. + int src4= LD32(src + 4 *srcStride);
  925. + int src5= LD32(src + 5 *srcStride);
  926. + int src6= LD32(src + 6 *srcStride);
  927. +
+ /* First compute the leftmost three columns */
  929. + PICO_MVRC_W(PICO_INPIX0, srcB);
  930. + PICO_MVRC_W(PICO_INPIX1, srcA);
  931. + PICO_MVRC_W(PICO_INPIX2, src0);
  932. + PICO_OP(0, 0, 0, 3, 6);
  933. + PICO_MVRC_W(PICO_INPIX2, src1);
  934. + PICO_MVRC_W(PICO_INPIX1, src2);
  935. + PICO_MVRC_W(PICO_INPIX0, src3);
  936. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  937. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  938. + dst += dstStride;
  939. + PICO_MVRC_W(PICO_INPIX0, srcA);
  940. + PICO_MVRC_W(PICO_INPIX1, src0);
  941. + PICO_MVRC_W(PICO_INPIX2, src1);
  942. + PICO_OP(0, 0, 0, 3, 6);
  943. + PICO_MVRC_W(PICO_INPIX2, src2);
  944. + PICO_MVRC_W(PICO_INPIX1, src3);
  945. + PICO_MVRC_W(PICO_INPIX0, src4);
  946. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  947. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  948. + dst += dstStride;
  949. + PICO_MVRC_W(PICO_INPIX0, src0);
  950. + PICO_MVRC_W(PICO_INPIX1, src1);
  951. + PICO_MVRC_W(PICO_INPIX2, src2);
  952. + PICO_OP(0, 0, 0, 3, 6);
  953. + PICO_MVRC_W(PICO_INPIX2, src3);
  954. + PICO_MVRC_W(PICO_INPIX1, src4);
  955. + PICO_MVRC_W(PICO_INPIX0, src5);
  956. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  957. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  958. + dst += dstStride;
  959. + PICO_MVRC_W(PICO_INPIX0, src1);
  960. + PICO_MVRC_W(PICO_INPIX1, src2);
  961. + PICO_MVRC_W(PICO_INPIX2, src3);
  962. + PICO_OP(0, 0, 0, 3, 6);
  963. + PICO_MVRC_W(PICO_INPIX2, src4);
  964. + PICO_MVRC_W(PICO_INPIX1, src5);
  965. + PICO_MVRC_W(PICO_INPIX0, src6);
  966. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  967. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  968. + /* Now compute the last column */
  969. +
  970. + union wordbytes {
  971. + int word;
  972. + struct {
  973. + unsigned int t:8;
  974. + unsigned int u:8;
  975. + unsigned int l:8;
  976. + unsigned int b:8;
  977. + } bytes; } tmp1, tmp2, tmp3;
  978. +
  979. +
  980. + tmp1.bytes.t = srcB;
  981. + tmp1.bytes.u = src1;
  982. + tmp1.bytes.l = src4;
  983. +
  984. + tmp2.bytes.t = srcA;
  985. + tmp2.bytes.u = src2;
  986. + tmp2.bytes.l = src5;
  987. +
  988. + tmp3.bytes.t = src0;
  989. + tmp3.bytes.u = src3;
  990. + tmp3.bytes.l = src6;
  991. +
  992. + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
  993. + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
  994. + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
  995. + set_pico_config(&h264_qpel4_v_lowpass_config2);
  996. +
  997. +
  998. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  999. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  1000. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  1001. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  1002. +
  1003. + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
  1004. + dst[3] = (char)(tmp1.bytes.b);
  1005. + dst[3 - dstStride] = (char)(tmp1.bytes.l);
  1006. + dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
  1007. + dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
  1008. +
  1009. + }
  1014. +}
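The union-based transpose above leans on two big-endian facts: the low byte of each LD32()'d row word is the column-3 pixel, and GCC allocates bitfields MSB-first on big-endian targets such as AVR32, so .t/.u/.l/.b name the bytes of .word from high to low. A minimal host-side model of that layout assumption (the check function is illustrative):

    #include <assert.h>

    union wordbytes {
        int word;
        struct { unsigned int t:8, u:8, l:8, b:8; } bytes;
    };

    /* On a big-endian target, writing t, u, l, b builds the word 0xTTUULLBB,
       which is how the code above packs three column-3 pixels (plus one
       don't-care byte) into one input word for the final planar pass. */
    static void check_wordbytes_layout(void)
    {
        union wordbytes w;
        w.bytes.t = 0xAA; w.bytes.u = 0xBB; w.bytes.l = 0xCC; w.bytes.b = 0xDD;
        assert(w.word == (int)0xAABBCCDD);  /* holds on big-endian AVR32 */
    }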
  1015. +
  1016. +static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1017. +
  1018. + /*
  1019. + const int w=4;
  1020. + uint8_t *cm = cropTbl + MAX_NEG_CROP;
  1021. + int i;
  1022. + for(i=0; i<w; i++)
  1023. + {
  1024. + const int srcB= src[-2*srcStride];\
  1025. + const int srcA= src[-1*srcStride];\
  1026. + const int src0= src[0 *srcStride];\
  1027. + const int src1= src[1 *srcStride];\
  1028. + const int src2= src[2 *srcStride];\
  1029. + const int src3= src[3 *srcStride];\
  1030. + const int src4= src[4 *srcStride];\
  1031. + const int src5= src[5 *srcStride];\
  1032. + const int src6= src[6 *srcStride];\
  1033. + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  1034. + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  1035. + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  1036. + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  1037. + dst++;\
  1038. + src++;\
  1039. + */
  1040. + uint8_t tmp_block[4*4];
  1041. +
  1042. + set_pico_config(&h264_qpel4_v_lowpass_config1);
  1043. +
  1044. + {
  1045. + int srcB= LD32(src - 2*srcStride);
  1046. + int srcA= LD32(src - 1*srcStride);
  1047. + int src0= LD32(src + 0 *srcStride);
  1048. + int src1= LD32(src + 1 *srcStride);
  1049. + int src2= LD32(src + 2 *srcStride);
  1050. + int src3= LD32(src + 3 *srcStride);
  1051. + int src4= LD32(src + 4 *srcStride);
  1052. + int src5= LD32(src + 5 *srcStride);
  1053. + int src6= LD32(src + 6 *srcStride);
  1054. +
+ /* First compute the leftmost three columns */
  1056. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1057. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1058. + PICO_MVRC_W(PICO_INPIX2, src0);
  1059. + PICO_OP(0, 0, 0, 3, 6);
  1060. + PICO_MVRC_W(PICO_INPIX2, src1);
  1061. + PICO_MVRC_W(PICO_INPIX1, src2);
  1062. + PICO_MVRC_W(PICO_INPIX0, src3);
  1063. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1064. + ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0));
  1065. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1066. + PICO_MVRC_W(PICO_INPIX1, src0);
  1067. + PICO_MVRC_W(PICO_INPIX2, src1);
  1068. + PICO_OP(0, 0, 0, 3, 6);
  1069. + PICO_MVRC_W(PICO_INPIX2, src2);
  1070. + PICO_MVRC_W(PICO_INPIX1, src3);
  1071. + PICO_MVRC_W(PICO_INPIX0, src4);
  1072. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1073. + ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0));
  1074. + PICO_MVRC_W(PICO_INPIX0, src0);
  1075. + PICO_MVRC_W(PICO_INPIX1, src1);
  1076. + PICO_MVRC_W(PICO_INPIX2, src2);
  1077. + PICO_OP(0, 0, 0, 3, 6);
  1078. + PICO_MVRC_W(PICO_INPIX2, src3);
  1079. + PICO_MVRC_W(PICO_INPIX1, src4);
  1080. + PICO_MVRC_W(PICO_INPIX0, src5);
  1081. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1082. + ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0));
  1083. + PICO_MVRC_W(PICO_INPIX0, src1);
  1084. + PICO_MVRC_W(PICO_INPIX1, src2);
  1085. + PICO_MVRC_W(PICO_INPIX2, src3);
  1086. + PICO_OP(0, 0, 0, 3, 6);
  1087. + PICO_MVRC_W(PICO_INPIX2, src4);
  1088. + PICO_MVRC_W(PICO_INPIX1, src5);
  1089. + PICO_MVRC_W(PICO_INPIX0, src6);
  1090. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  1091. + ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0));
  1092. + /* Now compute the last column */
  1093. +
  1094. + union wordbytes {
  1095. + int word;
  1096. + struct {
  1097. + unsigned int t:8;
  1098. + unsigned int u:8;
  1099. + unsigned int l:8;
  1100. + unsigned int b:8;
  1101. + } bytes; } tmp1, tmp2, tmp3;
  1102. +
  1103. +
  1104. + tmp1.bytes.t = srcB;
  1105. + tmp1.bytes.u = src1;
  1106. + tmp1.bytes.l = src4;
  1107. +
  1108. + tmp2.bytes.t = srcA;
  1109. + tmp2.bytes.u = src2;
  1110. + tmp2.bytes.l = src5;
  1111. +
  1112. + tmp3.bytes.t = src0;
  1113. + tmp3.bytes.u = src3;
  1114. + tmp3.bytes.l = src6;
  1115. +
  1116. + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
  1117. + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
  1118. + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
  1119. + set_pico_config(&h264_qpel4_v_lowpass_config2);
  1120. +
  1121. +
  1122. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  1123. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  1124. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  1125. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  1126. +
  1127. + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
  1128. + tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
  1129. + tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
  1130. + tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
  1131. + tmp_block[3] = (char)(tmp1.bytes.t);
  1132. +
  1133. + /* Compute the average */
  1134. + srcB= LD32(dst);
  1135. + srcA= LD32(dst + dstStride);
  1136. + src0= LD32(dst + dstStride*2);
  1137. + src1= LD32(dst + dstStride*3);
  1138. +
  1139. + src2= LD32(tmp_block);
  1140. + src3= LD32(tmp_block + 4);
  1141. + src4= LD32(tmp_block + 8);
  1142. + src5= LD32(tmp_block + 12);
  1143. +
  1144. + ST32(dst, rnd_avg32(srcB, src2));
  1145. + ST32(dst + dstStride, rnd_avg32(srcA, src3));
  1146. + ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
  1147. + ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
  1148. + }
  1149. +}
  1150. +
  1151. +static struct pico_config_t h264_qpel4_hv_lowpass_config = {
  1152. + .input_mode = PICO_HOR_FILTER_MODE,
  1153. + .output_mode = PICO_PACKED_MODE,
  1154. + .coeff_frac_bits = 10,
  1155. + .offset_frac_bits = 10,
  1156. + .coeff0_0 = 1,
  1157. + .coeff0_1 = -5,
  1158. + .coeff0_2 = 20,
  1159. + .coeff0_3 = 512,
  1160. + .coeff1_0 = -5,
  1161. + .coeff1_1 = 25,
  1162. + .coeff1_2 = -100,
  1163. + .coeff1_3 = 0,
  1164. + .coeff2_0 = 20,
  1165. + .coeff2_1 = -100,
  1166. + .coeff2_2 = 400,
  1167. + .coeff2_3 = 0
  1168. +};
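The coefficient rows in this config are the horizontal kernel (1, -5, 20) scaled by the vertical taps 1, -5 and 20, i.e. the cross products of applying the 6-tap filter in both dimensions; the double application scales by 32 x 32 = 1024 (hence coeff_frac_bits = 10), and coeff0_3 = 512 is the matching rounding offset (1 << 9). A scalar sketch of the 2D normalization being implemented (illustrative reference, not the PICO path):

    /* 2D half-pel sample from a 6x6 neighborhood: filter the rows, then the
       column of intermediates; round with 512 and shift by 10. */
    static uint8_t qpel_hv_center(const uint8_t *src, int stride)
    {
        static const int k[6] = { 1, -5, 20, 20, -5, 1 };
        int row[6], i, j, acc = 0;
        for (i = 0; i < 6; i++) {
            row[i] = 0;
            for (j = 0; j < 6; j++)
                row[i] += k[j] * src[(i - 2) * stride + (j - 2)];
        }
        for (i = 0; i < 6; i++)
            acc += k[i] * row[i];
        acc = (acc + 512) >> 10;
        return acc < 0 ? 0 : acc > 255 ? 255 : (uint8_t)acc;
    }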
  1169. +
  1170. +static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1171. +
  1172. + int32_t tmp_block[48];
  1173. + int32_t *tmp = tmp_block;
  1174. + int i;
  1175. +
  1176. + set_pico_config(&h264_qpel4_hv_lowpass_config);
  1177. +
  1178. + src -= 2;
  1179. + for ( i = 0; i < 2; i++ ){
  1180. + int srcB= LD32(src - 2*srcStride);
  1181. + int srcA= LD32(src - 1*srcStride);
  1182. + int src0= LD32(src + 0 *srcStride);
  1183. + int src1= LD32(src + 1 *srcStride);
  1184. + int src2= LD32(src + 2 *srcStride);
  1185. + int src3= LD32(src + 3 *srcStride);
  1186. + int src4= LD32(src + 4 *srcStride);
  1187. + int src5= LD32(src + 5 *srcStride);
  1188. + int src6= LD32(src + 6 *srcStride);
  1189. +
  1190. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1191. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1192. + PICO_MVRC_W(PICO_INPIX2, src0);
  1193. + PICO_OP(0, 0, 0, 4, 8);
  1194. + PICO_MVRC_W(PICO_INPIX2, src1);
  1195. + PICO_MVRC_W(PICO_INPIX1, src2);
  1196. + PICO_MVRC_W(PICO_INPIX0, src3);
  1197. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1198. + PICO_STCM_W(tmp,
  1199. + PICO_REGVECT_VMU0_OUT,
  1200. + PICO_REGVECT_VMU1_OUT,
  1201. + PICO_REGVECT_VMU2_OUT);
  1202. + tmp += 3;
  1203. +
  1204. + PICO_OP(0, 0, 1, 5, 9);
  1205. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1206. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1207. + PICO_MVRC_W(PICO_INPIX2, src0);
  1208. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1209. + PICO_STCM_W(tmp,
  1210. + PICO_REGVECT_VMU0_OUT,
  1211. + PICO_REGVECT_VMU1_OUT,
  1212. + PICO_REGVECT_VMU2_OUT);
  1213. + tmp += 3;
  1214. +
  1215. + PICO_MVRC_W(PICO_INPIX0, src1);
  1216. + PICO_OP(0, 0, 4, 8, 0);
  1217. + PICO_MVRC_W(PICO_INPIX2, src2);
  1218. + PICO_MVRC_W(PICO_INPIX1, src3);
  1219. + PICO_MVRC_W(PICO_INPIX0, src4);
  1220. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1221. + PICO_STCM_W(tmp,
  1222. + PICO_REGVECT_VMU0_OUT,
  1223. + PICO_REGVECT_VMU1_OUT,
  1224. + PICO_REGVECT_VMU2_OUT);
  1225. + tmp += 3;
  1226. +
  1227. + PICO_OP(0, 0, 1, 5, 9);
  1228. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1229. + PICO_MVRC_W(PICO_INPIX1, src0);
  1230. + PICO_MVRC_W(PICO_INPIX2, src1);
  1231. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1232. + PICO_STCM_W(tmp,
  1233. + PICO_REGVECT_VMU0_OUT,
  1234. + PICO_REGVECT_VMU1_OUT,
  1235. + PICO_REGVECT_VMU2_OUT);
  1236. + tmp += 3;
  1237. +
  1238. + PICO_MVRC_W(PICO_INPIX0, src2);
  1239. + PICO_OP(0, 0, 4, 8, 0);
  1240. + PICO_MVRC_W(PICO_INPIX2, src3);
  1241. + PICO_MVRC_W(PICO_INPIX1, src4);
  1242. + PICO_MVRC_W(PICO_INPIX0, src5);
  1243. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1244. + PICO_STCM_W(tmp,
  1245. + PICO_REGVECT_VMU0_OUT,
  1246. + PICO_REGVECT_VMU1_OUT,
  1247. + PICO_REGVECT_VMU2_OUT);
  1248. + tmp += 3;
  1249. +
  1250. + PICO_OP(0, 0, 1, 5, 9);
  1251. + PICO_MVRC_W(PICO_INPIX0, src0);
  1252. + PICO_MVRC_W(PICO_INPIX1, src1);
  1253. + PICO_MVRC_W(PICO_INPIX2, src2);
  1254. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1255. + PICO_STCM_W(tmp,
  1256. + PICO_REGVECT_VMU0_OUT,
  1257. + PICO_REGVECT_VMU1_OUT,
  1258. + PICO_REGVECT_VMU2_OUT);
  1259. + tmp += 3;
  1260. +
  1261. + PICO_MVRC_W(PICO_INPIX0, src3);
  1262. + PICO_OP(0, 0, 4, 8, 0);
  1263. + PICO_MVRC_W(PICO_INPIX2, src4);
  1264. + PICO_MVRC_W(PICO_INPIX1, src5);
  1265. + PICO_MVRC_W(PICO_INPIX0, src6);
  1266. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1267. + PICO_STCM_W(tmp,
  1268. + PICO_REGVECT_VMU0_OUT,
  1269. + PICO_REGVECT_VMU1_OUT,
  1270. + PICO_REGVECT_VMU2_OUT);
  1271. + tmp += 3;
  1272. +
  1273. + PICO_OP(0, 0, 1, 5, 9);
  1274. + PICO_MVRC_W(PICO_INPIX0, src1);
  1275. + PICO_MVRC_W(PICO_INPIX1, src2);
  1276. + PICO_MVRC_W(PICO_INPIX2, src3);
  1277. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1278. + PICO_STCM_W(tmp,
  1279. + PICO_REGVECT_VMU0_OUT,
  1280. + PICO_REGVECT_VMU1_OUT,
  1281. + PICO_REGVECT_VMU2_OUT);
  1282. + tmp += 3;
  1283. + src += 2;
  1284. + }
  1285. +
  1286. + src -= 1;
  1287. + tmp -= 48;
  1288. +
  1289. +
  1290. + PICO_PUT_W(PICO_CONFIG,
  1291. + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
  1292. + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
  1293. + | PICO_COEFF_FRAC_BITS(10)
  1294. + | PICO_OFFSET_FRAC_BITS(10));
  1295. +
  1296. + for ( i = 0; i < 2; i++ ){
  1297. + int srcB= LD32(src - 2*srcStride);
  1298. + int srcA= LD32(src - 1*srcStride);
  1299. + int src0= LD32(src + 0 *srcStride);
  1300. + int src1= LD32(src + 1 *srcStride);
  1301. + int src2= LD32(src + 2 *srcStride);
  1302. + int src3= LD32(src + 3 *srcStride);
  1303. + int src4= LD32(src + 4 *srcStride);
  1304. + int src5= LD32(src + 5 *srcStride);
  1305. + int src6= LD32(src + 6 *srcStride);
  1306. +
  1307. +
  1308. + PICO_LDCM_W_INC(tmp,
  1309. + PICO_REGVECT_VMU0_OUT,
  1310. + PICO_REGVECT_VMU1_OUT,
  1311. + PICO_REGVECT_VMU2_OUT);
  1312. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1313. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1314. + PICO_MVRC_W(PICO_INPIX2, src0);
  1315. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1316. + PICO_MVRC_W(PICO_INPIX2, src1);
  1317. + PICO_MVRC_W(PICO_INPIX1, src2);
  1318. + PICO_MVRC_W(PICO_INPIX0, src3);
  1319. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1320. +
  1321. + PICO_LDCM_W_INC(tmp,
  1322. + PICO_REGVECT_VMU0_OUT,
  1323. + PICO_REGVECT_VMU1_OUT,
  1324. + PICO_REGVECT_VMU2_OUT);
  1325. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1326. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1327. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1328. + PICO_MVRC_W(PICO_INPIX2, src0);
  1329. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1330. +
  1331. + PICO_LDCM_W_INC(tmp,
  1332. + PICO_REGVECT_VMU0_OUT,
  1333. + PICO_REGVECT_VMU1_OUT,
  1334. + PICO_REGVECT_VMU2_OUT);
  1335. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1336. + PICO_MVRC_W(PICO_INPIX1, src0);
  1337. + PICO_MVRC_W(PICO_INPIX2, src1);
  1338. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1339. + PICO_MVRC_W(PICO_INPIX2, src2);
  1340. + PICO_MVRC_W(PICO_INPIX1, src3);
  1341. + PICO_MVRC_W(PICO_INPIX0, src4);
  1342. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1343. +
  1344. + PICO_LDCM_W_INC(tmp,
  1345. + PICO_REGVECT_VMU0_OUT,
  1346. + PICO_REGVECT_VMU1_OUT,
  1347. + PICO_REGVECT_VMU2_OUT);
  1348. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1349. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1350. + PICO_MVRC_W(PICO_INPIX1, src0);
  1351. + PICO_MVRC_W(PICO_INPIX2, src1);
  1352. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1353. +
  1354. + ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
  1355. + ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
  1356. +
  1357. +
  1358. + PICO_LDCM_W_INC(tmp,
  1359. + PICO_REGVECT_VMU0_OUT,
  1360. + PICO_REGVECT_VMU1_OUT,
  1361. + PICO_REGVECT_VMU2_OUT);
  1362. + PICO_MVRC_W(PICO_INPIX0, src0);
  1363. + PICO_MVRC_W(PICO_INPIX1, src1);
  1364. + PICO_MVRC_W(PICO_INPIX2, src2);
  1365. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1366. + PICO_MVRC_W(PICO_INPIX2, src3);
  1367. + PICO_MVRC_W(PICO_INPIX1, src4);
  1368. + PICO_MVRC_W(PICO_INPIX0, src5);
  1369. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1370. +
  1371. + PICO_LDCM_W_INC(tmp,
  1372. + PICO_REGVECT_VMU0_OUT,
  1373. + PICO_REGVECT_VMU1_OUT,
  1374. + PICO_REGVECT_VMU2_OUT);
  1375. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1376. + PICO_MVRC_W(PICO_INPIX0, src0);
  1377. + PICO_MVRC_W(PICO_INPIX1, src1);
  1378. + PICO_MVRC_W(PICO_INPIX2, src2);
  1379. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1380. +
  1381. + PICO_LDCM_W_INC(tmp,
  1382. + PICO_REGVECT_VMU0_OUT,
  1383. + PICO_REGVECT_VMU1_OUT,
  1384. + PICO_REGVECT_VMU2_OUT);
  1385. + PICO_MVRC_W(PICO_INPIX0, src1);
  1386. + PICO_MVRC_W(PICO_INPIX1, src2);
  1387. + PICO_MVRC_W(PICO_INPIX2, src3);
  1388. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1389. + PICO_MVRC_W(PICO_INPIX2, src4);
  1390. + PICO_MVRC_W(PICO_INPIX1, src5);
  1391. + PICO_MVRC_W(PICO_INPIX0, src6);
  1392. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1393. +
  1394. + PICO_LDCM_W_INC(tmp,
  1395. + PICO_REGVECT_VMU0_OUT,
  1396. + PICO_REGVECT_VMU1_OUT,
  1397. + PICO_REGVECT_VMU2_OUT);
  1398. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1399. + PICO_MVRC_W(PICO_INPIX0, src1);
  1400. + PICO_MVRC_W(PICO_INPIX1, src2);
  1401. + PICO_MVRC_W(PICO_INPIX2, src3);
  1402. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1403. +
  1404. + ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
  1405. + ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
  1406. +
  1407. + dst += 2;
  1408. + src += 2;
  1409. + }
  1410. +}
  1411. +
  1412. +
  1413. +
  1414. +
  1415. +static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1416. +
  1417. + int32_t tmp_block[48];
  1418. + int32_t *tmp = tmp_block;
  1419. + int i;
  1420. +
  1421. + set_pico_config(&h264_qpel4_hv_lowpass_config);
  1422. +
  1423. + src -= 2;
  1424. + for ( i = 0; i < 2; i++ ){
  1425. + int srcB= LD32(src - 2*srcStride);
  1426. + int srcA= LD32(src - 1*srcStride);
  1427. + int src0= LD32(src + 0 *srcStride);
  1428. + int src1= LD32(src + 1 *srcStride);
  1429. + int src2= LD32(src + 2 *srcStride);
  1430. + int src3= LD32(src + 3 *srcStride);
  1431. + int src4= LD32(src + 4 *srcStride);
  1432. + int src5= LD32(src + 5 *srcStride);
  1433. + int src6= LD32(src + 6 *srcStride);
  1434. +
  1435. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1436. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1437. + PICO_MVRC_W(PICO_INPIX2, src0);
  1438. + PICO_OP(0, 0, 0, 4, 8);
  1439. + PICO_MVRC_W(PICO_INPIX2, src1);
  1440. + PICO_MVRC_W(PICO_INPIX1, src2);
  1441. + PICO_MVRC_W(PICO_INPIX0, src3);
  1442. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1443. + PICO_STCM_W(tmp,
  1444. + PICO_REGVECT_VMU0_OUT,
  1445. + PICO_REGVECT_VMU1_OUT,
  1446. + PICO_REGVECT_VMU2_OUT);
  1447. + tmp += 3;
  1448. +
  1449. + PICO_OP(0, 0, 1, 5, 9);
  1450. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1451. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1452. + PICO_MVRC_W(PICO_INPIX2, src0);
  1453. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1454. + PICO_STCM_W(tmp,
  1455. + PICO_REGVECT_VMU0_OUT,
  1456. + PICO_REGVECT_VMU1_OUT,
  1457. + PICO_REGVECT_VMU2_OUT);
  1458. + tmp += 3;
  1459. +
  1460. + PICO_MVRC_W(PICO_INPIX0, src1);
  1461. + PICO_OP(0, 0, 4, 8, 0);
  1462. + PICO_MVRC_W(PICO_INPIX2, src2);
  1463. + PICO_MVRC_W(PICO_INPIX1, src3);
  1464. + PICO_MVRC_W(PICO_INPIX0, src4);
  1465. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1466. + PICO_STCM_W(tmp,
  1467. + PICO_REGVECT_VMU0_OUT,
  1468. + PICO_REGVECT_VMU1_OUT,
  1469. + PICO_REGVECT_VMU2_OUT);
  1470. + tmp += 3;
  1471. +
  1472. + PICO_OP(0, 0, 1, 5, 9);
  1473. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1474. + PICO_MVRC_W(PICO_INPIX1, src0);
  1475. + PICO_MVRC_W(PICO_INPIX2, src1);
  1476. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1477. + PICO_STCM_W(tmp,
  1478. + PICO_REGVECT_VMU0_OUT,
  1479. + PICO_REGVECT_VMU1_OUT,
  1480. + PICO_REGVECT_VMU2_OUT);
  1481. + tmp += 3;
  1482. +
  1483. + PICO_MVRC_W(PICO_INPIX0, src2);
  1484. + PICO_OP(0, 0, 4, 8, 0);
  1485. + PICO_MVRC_W(PICO_INPIX2, src3);
  1486. + PICO_MVRC_W(PICO_INPIX1, src4);
  1487. + PICO_MVRC_W(PICO_INPIX0, src5);
  1488. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1489. + PICO_STCM_W(tmp,
  1490. + PICO_REGVECT_VMU0_OUT,
  1491. + PICO_REGVECT_VMU1_OUT,
  1492. + PICO_REGVECT_VMU2_OUT);
  1493. + tmp += 3;
  1494. +
  1495. + PICO_OP(0, 0, 1, 5, 9);
  1496. + PICO_MVRC_W(PICO_INPIX0, src0);
  1497. + PICO_MVRC_W(PICO_INPIX1, src1);
  1498. + PICO_MVRC_W(PICO_INPIX2, src2);
  1499. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1500. + PICO_STCM_W(tmp,
  1501. + PICO_REGVECT_VMU0_OUT,
  1502. + PICO_REGVECT_VMU1_OUT,
  1503. + PICO_REGVECT_VMU2_OUT);
  1504. + tmp += 3;
  1505. +
  1506. + PICO_MVRC_W(PICO_INPIX0, src3);
  1507. + PICO_OP(0, 0, 4, 8, 0);
  1508. + PICO_MVRC_W(PICO_INPIX2, src4);
  1509. + PICO_MVRC_W(PICO_INPIX1, src5);
  1510. + PICO_MVRC_W(PICO_INPIX0, src6);
  1511. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1512. + PICO_STCM_W(tmp,
  1513. + PICO_REGVECT_VMU0_OUT,
  1514. + PICO_REGVECT_VMU1_OUT,
  1515. + PICO_REGVECT_VMU2_OUT);
  1516. + tmp += 3;
  1517. +
  1518. + PICO_OP(0, 0, 1, 5, 9);
  1519. + PICO_MVRC_W(PICO_INPIX0, src1);
  1520. + PICO_MVRC_W(PICO_INPIX1, src2);
  1521. + PICO_MVRC_W(PICO_INPIX2, src3);
  1522. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1523. + PICO_STCM_W(tmp,
  1524. + PICO_REGVECT_VMU0_OUT,
  1525. + PICO_REGVECT_VMU1_OUT,
  1526. + PICO_REGVECT_VMU2_OUT);
  1527. + tmp += 3;
  1528. + src += 2;
  1529. + }
  1530. +
  1531. + src -= 1;
  1532. + tmp -= 48;
  1533. +
  1534. +
  1535. + PICO_PUT_W(PICO_CONFIG,
  1536. + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
  1537. + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
  1538. + | PICO_COEFF_FRAC_BITS(10)
  1539. + | PICO_OFFSET_FRAC_BITS(10));
  1540. +
  1541. + for ( i = 0; i < 2; i++ ){
  1542. + int srcB= LD32(src - 2*srcStride);
  1543. + int srcA= LD32(src - 1*srcStride);
  1544. + int src0= LD32(src + 0 *srcStride);
  1545. + int src1= LD32(src + 1 *srcStride);
  1546. + int src2= LD32(src + 2 *srcStride);
  1547. + int src3= LD32(src + 3 *srcStride);
  1548. + int src4= LD32(src + 4 *srcStride);
  1549. + int src5= LD32(src + 5 *srcStride);
  1550. + int src6= LD32(src + 6 *srcStride);
  1551. +
  1552. + PICO_LDCM_W_INC(tmp,
  1553. + PICO_REGVECT_VMU0_OUT,
  1554. + PICO_REGVECT_VMU1_OUT,
  1555. + PICO_REGVECT_VMU2_OUT);
  1556. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1557. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1558. + PICO_MVRC_W(PICO_INPIX2, src0);
  1559. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1560. + PICO_MVRC_W(PICO_INPIX2, src1);
  1561. + PICO_MVRC_W(PICO_INPIX1, src2);
  1562. + PICO_MVRC_W(PICO_INPIX0, src3);
  1563. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1564. +
  1565. + PICO_LDCM_W_INC(tmp,
  1566. + PICO_REGVECT_VMU0_OUT,
  1567. + PICO_REGVECT_VMU1_OUT,
  1568. + PICO_REGVECT_VMU2_OUT);
  1569. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1570. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1571. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1572. + PICO_MVRC_W(PICO_INPIX2, src0);
  1573. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1574. +
  1575. + PICO_LDCM_W_INC(tmp,
  1576. + PICO_REGVECT_VMU0_OUT,
  1577. + PICO_REGVECT_VMU1_OUT,
  1578. + PICO_REGVECT_VMU2_OUT);
  1579. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1580. + PICO_MVRC_W(PICO_INPIX1, src0);
  1581. + PICO_MVRC_W(PICO_INPIX2, src1);
  1582. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1583. + PICO_MVRC_W(PICO_INPIX2, src2);
  1584. + PICO_MVRC_W(PICO_INPIX1, src3);
  1585. + PICO_MVRC_W(PICO_INPIX0, src4);
  1586. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1587. +
  1588. + PICO_LDCM_W_INC(tmp,
  1589. + PICO_REGVECT_VMU0_OUT,
  1590. + PICO_REGVECT_VMU1_OUT,
  1591. + PICO_REGVECT_VMU2_OUT);
  1592. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1593. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1594. + PICO_MVRC_W(PICO_INPIX1, src0);
  1595. + PICO_MVRC_W(PICO_INPIX2, src1);
  1596. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1597. +
  1598. + ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
  1599. + ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0)));
  1600. +
  1601. +
  1602. + PICO_LDCM_W_INC(tmp,
  1603. + PICO_REGVECT_VMU0_OUT,
  1604. + PICO_REGVECT_VMU1_OUT,
  1605. + PICO_REGVECT_VMU2_OUT);
  1606. + PICO_MVRC_W(PICO_INPIX0, src0);
  1607. + PICO_MVRC_W(PICO_INPIX1, src1);
  1608. + PICO_MVRC_W(PICO_INPIX2, src2);
  1609. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1610. + PICO_MVRC_W(PICO_INPIX2, src3);
  1611. + PICO_MVRC_W(PICO_INPIX1, src4);
  1612. + PICO_MVRC_W(PICO_INPIX0, src5);
  1613. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1614. +
  1615. + PICO_LDCM_W_INC(tmp,
  1616. + PICO_REGVECT_VMU0_OUT,
  1617. + PICO_REGVECT_VMU1_OUT,
  1618. + PICO_REGVECT_VMU2_OUT);
  1619. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1620. + PICO_MVRC_W(PICO_INPIX0, src0);
  1621. + PICO_MVRC_W(PICO_INPIX1, src1);
  1622. + PICO_MVRC_W(PICO_INPIX2, src2);
  1623. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1624. +
  1625. + PICO_LDCM_W_INC(tmp,
  1626. + PICO_REGVECT_VMU0_OUT,
  1627. + PICO_REGVECT_VMU1_OUT,
  1628. + PICO_REGVECT_VMU2_OUT);
  1629. + PICO_MVRC_W(PICO_INPIX0, src1);
  1630. + PICO_MVRC_W(PICO_INPIX1, src2);
  1631. + PICO_MVRC_W(PICO_INPIX2, src3);
  1632. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1633. + PICO_MVRC_W(PICO_INPIX2, src4);
  1634. + PICO_MVRC_W(PICO_INPIX1, src5);
  1635. + PICO_MVRC_W(PICO_INPIX0, src6);
  1636. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1637. +
  1638. + PICO_LDCM_W_INC(tmp,
  1639. + PICO_REGVECT_VMU0_OUT,
  1640. + PICO_REGVECT_VMU1_OUT,
  1641. + PICO_REGVECT_VMU2_OUT);
  1642. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1643. + PICO_MVRC_W(PICO_INPIX0, src1);
  1644. + PICO_MVRC_W(PICO_INPIX1, src2);
  1645. + PICO_MVRC_W(PICO_INPIX2, src3);
  1646. + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1647. +
  1648. + ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
  1649. + ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0)));
  1650. +
  1651. + dst += 2;
  1652. + src += 2;
  1653. + }
  1654. +}
  1655. +
  1656. +
  1657. +static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1658. + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1659. + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1660. + src += 4*srcStride;
  1661. + dst += 4*dstStride;
  1662. + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1663. + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1664. +}
  1665. +
  1666. +static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1667. + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1668. + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1669. + src += 4*srcStride;
  1670. + dst += 4*dstStride;
  1671. + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1672. + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1673. +}
  1674. +
  1675. +static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1676. + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1677. + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1678. + src += 4*srcStride;
  1679. + dst += 4*dstStride;
  1680. + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1681. + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1682. +}
  1683. +
  1684. +static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1685. + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1686. + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1687. + src += 4*srcStride;
  1688. + dst += 4*dstStride;
  1689. + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1690. + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1691. +}
  1692. +
  1693. +static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1694. + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1695. + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1696. + src += 4*srcStride;
  1697. + dst += 4*dstStride;
  1698. + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1699. + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1700. +}
  1701. +
  1702. +static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1703. + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1704. + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1705. + src += 4*srcStride;
  1706. + dst += 4*dstStride;
  1707. + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1708. + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1709. +}
  1710. +
  1711. +static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1712. + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1713. + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1714. + src += 8*srcStride;
  1715. + dst += 8*dstStride;
  1716. + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1717. + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1718. +}
  1719. +
  1720. +static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1721. + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1722. + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1723. + src += 8*srcStride;
  1724. + dst += 8*dstStride;
  1725. + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1726. + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1727. +}
  1728. +
  1729. +static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1730. + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1731. + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1732. + src += 8*srcStride;
  1733. + dst += 8*dstStride;
  1734. + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1735. + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1736. +}
  1737. +
  1738. +static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1739. + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1740. + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1741. + src += 8*srcStride;
  1742. + dst += 8*dstStride;
  1743. + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1744. + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1745. +}
  1746. +
  1747. +static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1748. + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1749. + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1750. + src += 8*srcStride;
  1751. + dst += 8*dstStride;
  1752. + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1753. + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1754. +}
  1755. +
  1756. +static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1757. + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1758. + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1759. + src += 8*srcStride;
  1760. + dst += 8*dstStride;
  1761. + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1762. + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1763. +}
  1764. +
  1765. +
  1766. +#define H264_MC(OPNAME, SIZE) \
  1767. +static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
  1768. + OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
  1769. +}\
  1770. +\
  1771. +static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
  1772. + uint8_t half[SIZE*SIZE];\
  1773. + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
  1774. + OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
  1775. +}\
  1776. +\
  1777. +static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
  1778. + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
  1779. +}\
  1780. +\
  1781. +static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
  1782. + uint8_t half[SIZE*SIZE];\
  1783. + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
  1784. + OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
  1785. +}\
  1786. +\
  1787. +static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
  1788. + uint8_t full[SIZE*(SIZE+5)];\
  1789. + uint8_t * const full_mid= full + SIZE*2;\
  1790. + uint8_t half[SIZE*SIZE];\
  1791. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1792. + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
  1793. + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
  1794. +}\
  1795. +\
  1796. +static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
  1797. + uint8_t full[SIZE*(SIZE+5)];\
  1798. + uint8_t * const full_mid= full + SIZE*2;\
  1799. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1800. + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
  1801. +}\
  1802. +\
  1803. +static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
  1804. + uint8_t full[SIZE*(SIZE+5)];\
  1805. + uint8_t * const full_mid= full + SIZE*2;\
  1806. + uint8_t half[SIZE*SIZE];\
  1807. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1808. + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
  1809. + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
  1810. +}\
  1811. +\
  1812. +static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
  1813. + uint8_t full[SIZE*(SIZE+5)];\
  1814. + uint8_t * const full_mid= full + SIZE*2;\
  1815. + uint8_t halfH[SIZE*SIZE];\
  1816. + uint8_t halfV[SIZE*SIZE];\
  1817. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1818. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1819. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1820. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1821. +}\
  1822. +\
  1823. +static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
  1824. + uint8_t full[SIZE*(SIZE+5)];\
  1825. + uint8_t * const full_mid= full + SIZE*2;\
  1826. + uint8_t halfH[SIZE*SIZE];\
  1827. + uint8_t halfV[SIZE*SIZE];\
  1828. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1829. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1830. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1831. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1832. +}\
  1833. +\
  1834. +static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
  1835. + uint8_t full[SIZE*(SIZE+5)];\
  1836. + uint8_t * const full_mid= full + SIZE*2;\
  1837. + uint8_t halfH[SIZE*SIZE];\
  1838. + uint8_t halfV[SIZE*SIZE];\
  1839. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1840. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1841. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1842. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1843. +}\
  1844. +\
  1845. +static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
  1846. + uint8_t full[SIZE*(SIZE+5)];\
  1847. + uint8_t * const full_mid= full + SIZE*2;\
  1848. + uint8_t halfH[SIZE*SIZE];\
  1849. + uint8_t halfV[SIZE*SIZE];\
  1850. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1851. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1852. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1853. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1854. +}\
  1855. +\
  1856. +static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
  1857. + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
  1858. +}\
  1859. +\
  1860. +static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
  1861. + uint8_t halfH[SIZE*SIZE];\
  1862. + uint8_t halfHV[SIZE*SIZE];\
  1863. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1864. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1865. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  1866. +}\
  1867. +\
  1868. +static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
  1869. + uint8_t halfH[SIZE*SIZE];\
  1870. + uint8_t halfHV[SIZE*SIZE];\
  1871. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1872. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1873. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  1874. +}\
  1875. +\
  1876. +static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
  1877. + uint8_t full[SIZE*(SIZE+5)];\
  1878. + uint8_t * const full_mid= full + SIZE*2;\
  1879. + uint8_t halfV[SIZE*SIZE];\
  1880. + uint8_t halfHV[SIZE*SIZE];\
  1881. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1882. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1883. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1884. + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  1885. +}\
  1886. +\
  1887. +static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
  1888. + uint8_t full[SIZE*(SIZE+5)];\
  1889. + uint8_t * const full_mid= full + SIZE*2;\
  1890. + uint8_t halfV[SIZE*SIZE];\
  1891. + uint8_t halfHV[SIZE*SIZE];\
  1892. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1893. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1894. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1895. + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  1896. +}\
  1897. +
  1898. +H264_MC(put_, 4)
  1899. +H264_MC(put_, 8)
  1900. +H264_MC(put_, 16)
  1901. +H264_MC(avg_, 4)
  1902. +H264_MC(avg_, 8)
  1903. +H264_MC(avg_, 16)
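In the _mcXY names, X and Y are the quarter-pel offsets in x and y (FFmpeg's usual convention): mc20 is the horizontal half-pel case, mc02 the vertical one, mc22 the 2D half-pel case, and odd offsets average a half-pel plane with integer pixels or with another half-pel plane. For example, the SIZE == 4 put_ instantiation of mc10 expands to:

    static void put_h264_qpel4_mc10_pico(uint8_t *dst, uint8_t *src, int stride){
        uint8_t half[4*4];
        /* horizontal half-pel plane... */
        put_h264_qpel4_h_lowpass_pico(half, src, 4, stride);
        /* ...averaged with the integer-pel pixels gives the 1/4-pel plane */
        put_pixels4_l2(dst, src, half, stride, stride, 4, 4);
    }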
  1904. +
  1905. +
  1906. +
  1907. +#define dspfunc16(PFX) \
  1908. + void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1909. + PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
  1910. + PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
  1911. + }\
  1912. + void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1913. + PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
  1914. + PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
  1915. + }\
  1916. + void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1917. + PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
  1918. + PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
  1919. + }\
  1920. + void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1921. + PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
  1922. + PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
  1923. + }\
  1924. +
  1925. +
  1926. +dspfunc16(put)
  1927. +dspfunc16(put_no_rnd)
  1928. +dspfunc16(avg)
  1929. +dspfunc16(avg_no_rnd)
  1930. +#undef dspfunc16
  1931. +
  1932. +static int pix_sum_avr32(uint8_t * pix, int line_size)
  1933. +{
  1934. + int s, i;
  1935. +
  1936. + s = 0;
  1937. + for (i = 0; i < 16; i++) {
  1938. + int tmp1,tmp2,tmp3,tmp4,tmp5;
  1939. + __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
  1940. + "ld.w\t%1, %6[4]\n\t"
  1941. + "ld.w\t%2, %6[8]\n\t"
  1942. + "ld.w\t%3, %6[12]\n\t"
  1943. + "punpckub.h\t%4, %0:t\n\t"
  1944. + "padd.h\t%5, %5, %4\n\t"
  1945. + "punpckub.h\t%4, %0:b\n\t"
  1946. + "padd.h\t%5, %5, %4\n\t"
  1947. + "punpckub.h\t%4, %1:t\n\t"
  1948. + "padd.h\t%5, %5, %4\n\t"
  1949. + "punpckub.h\t%4, %1:b\n\t"
  1950. + "padd.h\t%5, %5, %4\n\t"
  1951. + "punpckub.h\t%4, %2:t\n\t"
  1952. + "padd.h\t%5, %5, %4\n\t"
  1953. + "punpckub.h\t%4, %2:b\n\t"
  1954. + "padd.h\t%5, %5, %4\n\t"
  1955. + "punpckub.h\t%4, %3:t\n\t"
  1956. + "padd.h\t%5, %5, %4\n\t"
  1957. + "punpckub.h\t%4, %3:b\n\t"
  1958. + "padd.h\t%5, %5, %4\n\t"
  1959. + : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"=&r"(s)
  1960. + : "r"(pix));
  1961. + pix += line_size;
  1962. + }
+ __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "+r" (s) );
  1964. +
  1965. + return s;
  1966. +}
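The asm accumulates eight pixels per row into each packed 16-bit half of s (each half stays below 0x8000 even for a 16x16 block of 255s) and folds the two halves with the final addhh.w; the scalar reference it replaces is:

    /* Scalar reference: sum of all pixels in a 16x16 block. */
    static int pix_sum_c(uint8_t *pix, int line_size)
    {
        int s = 0, i, j;
        for (i = 0; i < 16; i++) {
            for (j = 0; j < 16; j++)
                s += pix[j];
            pix += line_size;
        }
        return s;
    }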
  1967. +
  1968. +
  1969. +//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
  1970. +//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
  1971. +//#define H264_WEIGHT(W,H) \
  1972. +//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
  1973. +// int attribute_unused x, y; \
  1974. +// offset <<= log2_denom; \
  1975. +// if(log2_denom) offset += 1<<(log2_denom-1); \
  1976. +// for(y=0; y<H; y++, block += stride){ \
  1977. +// uint32_t tmp0, tmp1;
  1978. +// if(W==2) { \
  1979. +// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
  1980. +// "ld.ub\t%[tmp1], %[block][1]\n" \
  1981. +// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
  1982. +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
  1983. +// "asr\t%[tmp0], %[log2_denom]\n" \
  1984. +// "asr\t%[tmp1], %[log2_denom]\n" \
  1985. +// "satu\t%[tmp0] >> 0, 8\n" \
  1986. +// "satu\t%[tmp1] >> 0, 8\n" \
  1987. +// "st.b\t%[block][0], %[tmp0]\n" \
  1988. +// "st.b\t%[block][1], %[tmp1]\n" \
  1989. +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
+// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2_denom) ); \
  1991. +// } else if ( W==4 ) { \
  1992. +// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
  1993. +// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
  1994. +// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
  1995. +// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
  1996. +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
  1997. +// "asr\t%[tmp0], %[log2_denom]\n" \
  1998. +// "asr\t%[tmp1], %[log2_denom]\n" \
  1999. +// "satu\t%[tmp0] >> 0, 8\n" \
  2000. +// "satu\t%[tmp1] >> 0, 8\n" \
  2001. +// "st.b\t%[block][0], %[tmp0]\n" \
  2002. +// "st.b\t%[block][1], %[tmp1]\n" \
  2003. +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
+// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2_denom) ); \
  2005. +//
  2006. +//
  2007. +//
  2008. +// if(W==4) continue; \
  2009. +// op_scale1(4); \
  2010. +// op_scale1(5); \
  2011. +// op_scale1(6); \
  2012. +// op_scale1(7); \
  2013. +// if(W==8) continue; \
  2014. +// op_scale1(8); \
  2015. +// op_scale1(9); \
  2016. +// op_scale1(10); \
  2017. +// op_scale1(11); \
  2018. +// op_scale1(12); \
  2019. +// op_scale1(13); \
  2020. +// op_scale1(14); \
  2021. +// op_scale1(15); \
  2022. +// } \
  2023. +//} \
  2024. +//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
  2025. +// int attribute_unused x, y; \
  2026. +// int offset = (offsets + offsetd + 1) >> 1; \
  2027. +// offset = ((offset << 1) + 1) << log2_denom; \
  2028. +// for(y=0; y<H; y++, dst += stride, src += stride){ \
  2029. +// op_scale2(0); \
  2030. +// op_scale2(1); \
  2031. +// if(W==2) continue; \
  2032. +// op_scale2(2); \
  2033. +// op_scale2(3); \
  2034. +// if(W==4) continue; \
  2035. +// op_scale2(4); \
  2036. +// op_scale2(5); \
  2037. +// op_scale2(6); \
  2038. +// op_scale2(7); \
  2039. +// if(W==8) continue; \
  2040. +// op_scale2(8); \
  2041. +// op_scale2(9); \
  2042. +// op_scale2(10); \
  2043. +// op_scale2(11); \
  2044. +// op_scale2(12); \
  2045. +// op_scale2(13); \
  2046. +// op_scale2(14); \
  2047. +// op_scale2(15); \
  2048. +// } \
  2049. +//}
  2050. +
  2051. +
  2052. +
  2053. +/* Returns zero in each byte where the absolute difference between <a> and <b>
  2054. + is not less than <compare> */
  2055. +#define PABS_DIFF_LESS_THAN( a, b, compare) \
  2056. + ({ uint32_t __tmp__, __tmp2__, __mask__; \
  2057. + asm ( \
  2058. + /* Check ABS( a - b ) < compare */ \
  2059. + "psubs.ub\t%[tmp], %[opa], %[opb]\n" \
  2060. + "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
  2061. + "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \
  2062. + /* This produces 0 for all bytes where the comparison is not true */ \
  2063. + "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
  2064. + : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \
  2065. + : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
  2066. + __mask__; })
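The macro works because |a - b| of unsigned bytes is the OR of the two saturating differences, and a second saturating subtract collapses the comparison into a zero/non-zero byte. A byte-wise C model (one lane shown; the helper name is illustrative):

    /* Model of PABS_DIFF_LESS_THAN for a single byte lane: non-zero
       where |a - b| < cmp, zero elsewhere. */
    static uint8_t abs_diff_less_than(uint8_t a, uint8_t b, uint8_t cmp)
    {
        uint8_t d1 = a > b ? a - b : 0;    /* psubs.ub  a, b    */
        uint8_t d2 = b > a ? b - a : 0;    /* psubs.ub  b, a    */
        uint8_t ad = d1 | d2;              /* |a - b|           */
        return cmp > ad ? cmp - ad : 0;    /* psubs.ub  cmp, ad */
    }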
  2067. +
  2068. +/*
  2069. + Set all bytes containing zero in <value> to 255 and the rest to zero.
  2070. +
  2071. + Add with saturation 254 to all bytes making all bytes different from
  2072. + zero become 255. Then add one without saturation to make all bytes
  2073. + originally containing zero 255 and the rest 0. */
  2074. +#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
  2075. + ({ uint32_t __tmp__; \
  2076. + asm ( \
  2077. + "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \
  2078. + "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
  2079. + : [tmp] "=r"(__tmp__) \
  2080. + : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
  2081. + __tmp__; })
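Worked per-byte example of the trick (the helper name is illustrative):

    /* x == 0: 0 +sat 0xFE = 0xFE, + 1 = 0xFF
       x != 0: x +sat 0xFE = 0xFF, + 1 = 0x00 (wraps within the byte) */
    static uint8_t zero_byte_to_ff(uint8_t x)
    {
        uint8_t t = (x >= 2) ? 0xFF : (uint8_t)(x + 0xFE); /* padds.ub */
        return (uint8_t)(t + 1);                           /* padd.b   */
    }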
  2082. +
  2083. +#define PACKW_SH(upper, lower) \
  2084. + ({ uint32_t __tmp__; \
  2085. + asm ( \
  2086. + "packw.sh\t%[tmp], %[u], %[l]\n" \
  2087. + : [tmp] "=r"(__tmp__) \
  2088. + : [u] "r"(upper), [l] "r"(lower) ); \
  2089. + __tmp__; })
  2090. +
  2091. +#define PACKSH_UB(upper, lower) \
  2092. + ({ uint32_t __tmp__; \
  2093. + asm ( \
  2094. + "packsh.sb\t%[tmp], %[u], %[l]\n" \
  2095. + : [tmp] "=r"(__tmp__) \
  2096. + : [u] "r"(upper), [l] "r"(lower) ); \
  2097. + __tmp__; })
  2098. +
  2099. +static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  2100. +{
  2101. + int i;
  2102. +
  2103. + if ( alpha == 0 )
  2104. + return;
  2105. +
  2106. + alpha = PACKW_SH(alpha, alpha);
  2107. + alpha = PACKSH_UB(alpha, alpha);
  2108. + beta = PACKW_SH(beta, beta);
  2109. + beta = PACKSH_UB(beta, beta);
  2110. +
  2111. + for( i = 0; i < 4; i++ ) {
  2112. + uint32_t p0, p1, p2, q0, q1, q2;
  2113. + uint32_t mask, mask2;
  2114. + uint32_t tmp, tmp2, tmp3, tmp4;
  2115. +
  2116. + if( tc0[i] < 0 ) {
  2117. + pix += 4;
  2118. + continue;
  2119. + }
  2120. +
  2121. +/* for( d = 0; d < 4; d++ ) {
  2122. + const int p0 = pix[-1*stride];
  2123. + const int p1 = pix[-2*stride];
  2124. + const int p2 = pix[-3*stride];
  2125. + const int q0 = pix[0];
  2126. + const int q1 = pix[1*stride];
  2127. + const int q2 = pix[2*stride];
  2128. +
  2129. + if( ABS( p0 - q0 ) < alpha &&
  2130. + ABS( p1 - p0 ) < beta &&
  2131. + ABS( q1 - q0 ) < beta ) { */
  2132. +
  2133. + p0 = LD32(pix - stride);
  2134. + p1 = LD32(pix - 2*stride);
  2135. + q0 = LD32(pix);
  2136. + q1 = LD32(pix + stride);
  2137. +
+ /* Check which of the columns should be filtered, if any. A column
+    may only be filtered when all three threshold tests hold, so OR
+    together the per-test failure masks (0xFF in each byte where the
+    corresponding test fails). */
+ mask = SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(p0, q0, alpha));
+ mask |= SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(p1, p0, beta));
+ mask |= SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(q1, q0, beta));
+
+ if ( mask == 0xFFFFFFFF ){
+ pix += 4;
+ continue;
+ }
  2147. +
  2148. +
  2149. + int tc = PACKW_SH(tc0[i], tc0[i]);
  2150. + int tc0_p = tc;
  2151. + int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
  2152. +
  2153. + /*
  2154. + int i_delta;
  2155. + if( ABS( p2 - p0 ) < beta ) {
  2156. + pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
  2157. + tc++;
  2158. + }*/
  2159. +
  2160. + p2 = LD32(pix - 3*stride);
  2161. + mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
  2162. +
  2163. + if ( mask2 ){
  2164. + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
  2165. + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
  2166. + "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
  2167. + "punpckub.h\t%[tmp2], %[tmp]:t\n"
  2168. + "punpckub.h\t%[tmp], %[tmp]:b\n"
  2169. + "punpckub.h\t%[tmp3], %[p1]:t\n"
  2170. + "punpckub.h\t%[tmp4], %[p1]:b\n"
  2171. + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2172. + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
  2173. + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
  2174. + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
  2175. + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
  2176. + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
  2177. + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2178. + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
  2179. + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
  2180. + "andn\t%[tmp], %[mask2]\n"
  2181. + "and\t%[tmp2], %[q1], %[mask2]\n"
  2182. + "or\t%[tmp], %[tmp2]\n"
  2183. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2184. + [tmp4]"=&r"(tmp4)
  2185. + : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
  2186. + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
  2187. + ST32(pix - 2*stride, tmp);
  2188. + tc += 0x00010001;
  2189. + }
  2190. +
  2191. +
  2192. + q2 = LD32(pix + 2*stride);
  2193. +
  2194. + /*
  2195. + if( ABS( q2 - q0 ) < beta ) {
  2196. + pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
  2197. + tc++;
  2198. + }
  2199. + */
  2200. + mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
  2201. +
  2202. + if ( mask2 ){
  2203. + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
  2204. + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
  2205. + "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
  2206. + "punpckub.h\t%[tmp2], %[tmp]:t\n"
  2207. + "punpckub.h\t%[tmp], %[tmp]:b\n"
  2208. + "punpckub.h\t%[tmp3], %[q1]:t\n"
  2209. + "punpckub.h\t%[tmp4], %[q1]:b\n"
  2210. + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2211. + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
  2212. + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
  2213. + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
  2214. + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
  2215. + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
  2216. + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2217. + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
  2218. + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
  2219. + "andn\t%[tmp], %[mask2]\n"
  2220. + "and\t%[tmp2], %[q1], %[mask2]\n"
  2221. + "or\t%[tmp], %[tmp2]\n"
  2222. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2223. + [tmp4]"=&r"(tmp4)
  2224. + : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
  2225. + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
  2226. + ST32(pix + stride, tmp);
  2227. + tc += 0x00010001;
  2228. + }
  2229. +
  2230. + uint32_t old_p0 = p0;
  2231. + uint32_t old_q0 = q0;
  2232. +
  2233. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  2234. + pix[-stride] = clip_uint8( p0 + i_delta );
  2235. + pix[0] = clip_uint8( q0 - i_delta ); */
  2236. +
  2237. + asm (
  2238. + /* Check if the two upper pixels should be filtered */
  2239. + "lsr\t%[tmp], %[inv_mask], 16\n"
  2240. + "breq\t0f\n"
  2241. +
  2242. + "punpckub.h\t%[tmp], %[p1]:t\n"
  2243. + "punpckub.h\t%[tmp2], %[q1]:t\n"
  2244. +
  2245. + /* p1 - q1 */
  2246. + "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
  2247. +
  2248. + "punpckub.h\t%[tmp3], %[q0]:t\n"
  2249. + "punpckub.h\t%[tmp4], %[p0]:t\n"
  2250. +
  2251. + /* q0 - p0 */
  2252. + "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
  2253. +
  2254. + /* (q0 - p0) << 2 */
  2255. + "plsl.h\t%[tmp2], %[tmp2], 2\n"
  2256. +
  2257. + /* ((q0 - p0) << 2) + (p1 - q1) */
  2258. + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
  2259. +
  2260. + "mov\t%[tmp], 0x00040004\n"
  2261. + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
  2262. + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
  2263. +
  2264. + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
  2265. + "pasr.h\t%[tmp2], %[tmp2], 3\n"
  2266. +
  2267. + "mov\t%[tmp], 0\n"
  2268. + "psub.h\t%[tmp], %[tmp], %[tc]\n"
  2269. +
  2270. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
  2271. + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
  2272. + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
  2273. +
  2274. +
  2275. + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
  2276. + "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
  2277. +
  2278. +
  2279. + /* pix[0] = clip_uint8( q0 - i_delta ); */
  2280. + "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
  2281. +
  2282. + /* Check if the two lower pixels should be filtered */
  2283. + "lsl\t%[tmp2], %[inv_mask], 16\n"
  2284. + "breq\t1f\n"
  2285. +
  2286. + "0:\n"
  2287. + "punpckub.h\t%[p1], %[p1]:b\n"
  2288. + "punpckub.h\t%[q1], %[q1]:b\n"
  2289. +
  2290. + /* p1 - q1 */
  2291. + "psub.h\t%[p1], %[p1], %[q1]\n"
  2292. +
  2293. + "punpckub.h\t%[q0], %[q0]:b\n"
  2294. + "punpckub.h\t%[p0], %[p0]:b\n"
  2295. +
  2296. + /* q0 - p0 */
  2297. + "psub.h\t%[tmp2], %[q0], %[p0]\n"
  2298. +
  2299. + /* (q0 - p0) << 2 */
  2300. + "plsl.h\t%[tmp2], %[tmp2], 2\n"
  2301. +
  2302. + /* ((q0 - p0) << 2) + (p1 - q1) */
  2303. + "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
  2304. +
  2305. + "mov\t%[q1], 0x00040004\n"
  2306. + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
  2307. + "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
  2308. +
  2309. + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
  2310. + "pasr.h\t%[tmp2], %[tmp2], 3\n"
  2311. +
  2312. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
  2313. + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
  2314. + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
  2315. +
  2316. + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
  2317. + "padd.h\t%[p0], %[p0], %[tmp2]\n"
  2318. +
  2319. + /* pix[0] = clip_uint8( q0 - i_delta ); */
  2320. + "psub.h\t%[q0], %[q0], %[tmp2]\n"
  2321. +
  2322. + "1:\n"
  2323. + "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
  2324. + "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
  2325. +
  2326. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2327. + [tmp4]"=&r"(tmp4), [q0]"+r"(q0), [q1]"+r"(q1), [p0]"+r"(p0), [p1]"+r"(p1)
  2328. + : [tc]"r"(tc), [inv_mask]"r"(~mask));
  2329. +
  2330. + ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
  2331. + ST32(pix, (mask & old_q0) | (q0 & ~mask));
  2332. +
  2333. + }
  2334. + pix += 1;
  2335. +}
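
The two ST32 lines above merge filtered and unfiltered bytes without branching, the same select pattern the andn/and/or sequences use inside the asm blocks. A minimal sketch of that pattern in portable C, assuming mask holds 0xFF in every byte that must keep its original value (names are illustrative):

#include <stdint.h>

/* Branchless per-byte select: keep bytes of 'oldv' where the mask byte
   is 0xFF, take bytes of 'newv' where the mask byte is 0x00. */
static inline uint32_t select_bytes(uint32_t newv, uint32_t oldv,
                                    uint32_t mask)
{
    return (oldv & mask) | (newv & ~mask);
}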
  2336. +
  2337. +
  2338. +
  2339. +
  2340. +#ifdef CHECK_DSP_FUNCS_AGAINST_C
  2341. +
  2342. +void dump_block8(uint8_t *block, int line_size, int h){
  2343. + int i, j;
  2344. +
  2345. + for ( i = 0; i < h ; i++ ){
  2346. + av_log(NULL, AV_LOG_ERROR, "\t");
  2347. + for ( j = 0; j < 8 ; j++ ){
  2348. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2349. + }
  2350. + av_log(NULL, AV_LOG_ERROR, "\n");
  2351. + }
  2352. +}
  2353. +
  2354. +void dump_block4(uint8_t *block, int line_size, int h){
  2355. + int i, j;
  2356. +
  2357. + for ( i = 0; i < h ; i++ ){
  2358. + av_log(NULL, AV_LOG_ERROR, "\t");
  2359. + for ( j = 0; j < 4 ; j++ ){
  2360. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2361. + }
  2362. + av_log(NULL, AV_LOG_ERROR, "\n");
  2363. + }
  2364. +}
  2365. +
  2366. +void dump_block(uint8_t *block, int line_size, int h, int w){
  2367. + int i, j;
  2368. +
  2369. + for ( i = 0; i < h ; i++ ){
  2370. + av_log(NULL, AV_LOG_ERROR, "\t");
  2371. + for ( j = 0; j < w ; j++ ){
  2372. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2373. + }
  2374. + av_log(NULL, AV_LOG_ERROR, "\n");
  2375. + }
  2376. +}
  2377. +
  2378. +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2379. + int h, char *name, int max_dev){
  2380. + int i,j;
  2381. + for ( i = 0; i < 8 ; i++ ){
  2382. + for ( j = 0; j < h ; j++ ){
  2383. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2384. + diff = diff < 0 ? -diff : diff;
  2385. + if ( diff > max_dev ){
  2386. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
  2387. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2388. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
  2389. + dump_block8(test, line_size_test, h);
  2390. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2391. + dump_block8(correct, line_size_correct, h);
  2392. + exit(1);
  2393. + }
  2394. + }
  2395. + }
  2396. +}
  2397. +
  2398. +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2399. + int h, char *name, int max_dev){
  2400. + int i,j;
  2401. + for ( i = 0; i < 4 ; i++ ){
  2402. + for ( j = 0; j < h ; j++ ){
  2403. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2404. + diff = diff < 0 ? -diff : diff;
  2405. + if ( diff > max_dev ){
  2406. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
  2407. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2408. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
  2409. + dump_block4(test, line_size_test, h);
  2410. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2411. + dump_block4(correct, line_size_correct, h);
  2412. + exit(1);
  2413. + }
  2414. + }
  2415. + }
  2416. +}
  2417. +
  2418. +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2419. + int h, int width, char *name, int max_dev){
  2420. + int i,j;
  2421. + for ( i = 0; i < width ; i++ ){
  2422. + for ( j = 0; j < h ; j++ ){
  2423. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2424. + diff = diff < 0 ? -diff : diff;
  2425. + if ( diff > max_dev ){
  2426. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
  2427. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2428. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
  2429. + dump_block(test, line_size_test, h, width);
  2430. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2431. + dump_block(correct, line_size_correct, h, width);
  2432. + exit(1);
  2433. + }
  2434. + }
  2435. + }
  2436. +}
  2437. +
  2438. +void dump_dct_block(DCTELEM *block){
  2439. + int i, j;
  2440. +
  2441. + for ( i = 0; i < 8 ; i++ ){
  2442. + av_log(NULL, AV_LOG_ERROR, "\t");
  2443. + for ( j = 0; j < 8 ; j++ ){
  2444. + av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]);
  2445. + }
  2446. + av_log(NULL, AV_LOG_ERROR, "\n");
  2447. + }
  2448. +}
  2449. +
  2450. +void test_idct_avr32(DCTELEM *block){
  2451. + DCTELEM testBlock[64];
  2452. + int i, j;
  2453. +
  2454. + /* Copy transposed block to testBlock */
  2455. + for ( i = 0; i < 8 ; i++ ){
  2456. + for ( j = 0; j < 8 ; j++ ){
  2457. + testBlock[i + 8*j] = block[j + i*8];
  2458. + }
  2459. + }
  2460. +
  2461. + idct_avr32(block);
  2462. + simple_idct(testBlock);
  2463. +
  2464. + for ( i = 0; i < 64 ; i++ ){
  2465. + if ( block[i] != testBlock[i] ){
  2466. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
  2467. + dump_dct_block(block);
  2468. + av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n");
  2469. + dump_dct_block(testBlock);
  2470. + exit(1);
  2471. + }
  2472. + }
  2473. +}
  2474. +
  2475. +void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
  2476. + uint8_t testBlock[64];
  2477. + DCTELEM blockCopy[64];
  2478. + int i, j;
  2479. +
  2480. + /* Copy transposed block to blockCopy */
  2481. + for ( i = 0; i < 8 ; i++ ){
  2482. + for ( j = 0; j < 8 ; j++ ){
  2483. + blockCopy[i + 8*j] = block[j + i*8];
  2484. + }
  2485. + }
  2486. +
  2487. + idct_put_avr32(dest, line_size, block);
  2488. + simple_idct_put(testBlock, 8, blockCopy);
  2489. +
  2490. + check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
  2491. +}
  2492. +
  2493. +
  2494. +void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
  2495. + uint8_t testBlock[64];
  2496. + DCTELEM blockCopy[64];
  2497. + int i, j;
  2498. +
  2499. + /* Copy dest to testBlock */
  2500. + for ( i = 0; i < 8 ; i++ ){
  2501. + for ( j = 0; j < 8 ; j++ ){
  2502. + testBlock[i + 8*j] = dest[i + j*line_size];
  2503. + }
  2504. + }
  2505. +
  2506. + /* Copy transposed block to blockCopy */
  2507. + for ( i = 0; i < 8 ; i++ ){
  2508. + for ( j = 0; j < 8 ; j++ ){
  2509. + blockCopy[i + 8*j] = block[j + i*8];
  2510. + }
  2511. + }
  2512. +
  2513. + idct_add_avr32(dest, line_size, block);
  2514. + simple_idct_add(testBlock, 8, blockCopy);
  2515. +
  2516. + check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
  2517. +}
  2518. +
  2519. +void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
  2520. + uint8_t testBlock[16];
  2521. + DCTELEM blockCopy[16];
  2522. + int i, j;
  2523. +
  2524. + /* Copy dest to testBlock */
  2525. + for ( i = 0; i < 4 ; i++ ){
  2526. + for ( j = 0; j < 4 ; j++ ){
  2527. + testBlock[i + 4*j] = dest[i + j*stride];
  2528. + }
  2529. + }
  2530. +
  2531. + /* Copy source block to blockCopy (no transpose needed here) */
  2532. + for ( i = 0; i < 16 ; i++ ){
  2533. + blockCopy[i] = block[i];
  2534. + }
  2535. +
  2536. + ff_h264_idct_add_c(dest, block, stride);
  2537. +
  2538. + h264_idct_add_avr32(testBlock, blockCopy, 4);
  2539. +
  2540. + check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0);
  2541. +}
  2542. +
  2543. +void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
  2544. + uint8_t testBlock[8*8];
  2545. + DCTELEM blockCopy[8*8];
  2546. + int i, j;
  2547. +
  2548. + /* Copy dest to testBlock */
  2549. + for ( i = 0; i < 8 ; i++ ){
  2550. + for ( j = 0; j < 8 ; j++ ){
  2551. + testBlock[i + 8*j] = dest[i + j*stride];
  2552. + }
  2553. + }
  2554. +
  2555. + /* Copy source block to blockCopy */
  2556. + for ( i = 0; i < 8*8 ; i++ ){
  2557. + blockCopy[i] = block[i];
  2558. + }
  2559. +
  2560. + ff_h264_idct8_add_c(dest, block, stride);
  2561. + h264_idct8_add_avr32(testBlock, blockCopy, 8);
  2562. +
  2563. + check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0);
  2564. +}
  2565. +
  2566. +void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
  2567. + const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
  2568. + uint8_t *testBlock, *testBlock2;
  2569. + int i, j;
  2570. + int input_v_size = h + in_v_size;
  2571. + int input_h_size = 8 + in_h_size;
  2572. +
  2573. + testBlock = alloca(input_h_size*input_v_size);
  2574. + testBlock2 = alloca(input_h_size*input_v_size);
  2575. +
  2576. + for ( i = 0; i < input_h_size ; i++ ){
  2577. + for ( j = 0; j < input_v_size ; j++ ){
  2578. + testBlock[i + input_h_size*j] = pixels[i + j*line_size];
  2579. + }
  2580. + }
  2581. +
  2582. + test(block, pixels, line_size, h);
  2583. + correct(testBlock2, testBlock, input_h_size, h);
  2584. +
  2585. + check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
  2586. +
  2587. +}
  2588. +
  2589. +void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
  2590. + uint8_t *src, int stride, int h, int w, int x, int y, char *name){
  2591. + uint8_t *testBlock, *testBlock2;
  2592. + int i, j;
  2593. + int input_v_size = h + 1;
  2594. + int input_h_size = ((w + 1) + 3) & ~3;
  2595. +
  2596. + testBlock = alloca(input_h_size*input_v_size);
  2597. + testBlock2 = alloca(input_h_size*input_v_size);
  2598. +
  2599. + for ( i = 0; i < w + 1 ; i++ ){
  2600. + for ( j = 0; j < h + 1 ; j++ ){
  2601. + testBlock[i + input_h_size*j] = src[i + j*stride];
  2602. + }
  2603. + }
  2604. +
  2605. + for ( i = 0; i < w ; i++ ){
  2606. + for ( j = 0; j < h ; j++ ){
  2607. + testBlock2[i + input_h_size*j] = dst[i + j*stride];
  2608. + }
  2609. + }
  2610. +
  2611. + test(dst, src, stride, h, x, y);
  2612. + correct(testBlock2, testBlock, input_h_size, h, x, y);
  2613. +
  2614. + check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0);
  2615. +
  2616. +}
  2617. +
  2618. +void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
  2619. + uint8_t *src, int stride, int size, char *name){
  2620. + uint8_t *testBlock, *testBlock2;
  2621. + int i, j;
  2622. + int test_stride = size + 8;
  2623. +
  2624. + testBlock = (uint8_t *)alloca(test_stride*(size+8)) + 4 + test_stride*4;
  2625. + testBlock2 = alloca(test_stride*size);
  2626. +
  2627. + for ( i = -4; i < size+4 ; i++ ){
  2628. + for ( j = -4; j < size+4 ; j++ ){
  2629. + testBlock[i + test_stride*j] = src[i + j*stride];
  2630. + }
  2631. + }
  2632. +
  2633. + for ( i = 0; i < size ; i++ ){
  2634. + for ( j = 0; j < size ; j++ ){
  2635. + testBlock2[i + test_stride*j] = dst[i + j*stride];
  2636. + }
  2637. + }
  2638. +
  2639. + correct(dst, src, stride);
  2640. + test(testBlock2, testBlock, test_stride);
  2641. +
  2642. + check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
  2643. +
  2644. +}
  2645. +
  2646. +
  2647. +#define test_pixels_funcs(PFX, NUM ) \
  2648. +void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2649. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
  2650. + block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
  2651. +void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2652. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
  2653. + block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
  2654. +void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2655. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
  2656. + block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
  2657. +void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2658. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
  2659. + block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
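
For example, the test_pixels_funcs(put, 8) instantiation below expands to four wrappers; writing the first one out makes the indirection easier to follow:

void test_put_pixels8_avr32(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    /* Runs the AVR32 routine on 'block' and the C reference on a
       private copy, then compares the two results byte for byte. */
    test_put_pixels_funcs8(put_pixels8_avr32, put_pixels8_c,
                           block, pixels, line_size, h,
                           "test_put_pixels", 0, 0);
}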
  2660. +
  2661. +test_pixels_funcs(put, 8);
  2662. +test_pixels_funcs(put_no_rnd, 8);
  2663. +test_pixels_funcs(put, 16);
  2664. +test_pixels_funcs(put_no_rnd, 16);
  2665. +
  2666. +test_pixels_funcs(avg, 8);
  2667. +test_pixels_funcs(avg_no_rnd, 8);
  2668. +test_pixels_funcs(avg, 16);
  2669. +test_pixels_funcs(avg_no_rnd, 16);
  2670. +
  2671. +#define test_h264_chroma_mc_funcs(PFX, NUM ) \
  2672. +void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
  2673. + test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
  2674. + dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
  2675. +
  2676. +test_h264_chroma_mc_funcs(put, 2);
  2677. +test_h264_chroma_mc_funcs(put, 4);
  2678. +test_h264_chroma_mc_funcs(put, 8);
  2679. +test_h264_chroma_mc_funcs(avg, 2);
  2680. +test_h264_chroma_mc_funcs(avg, 4);
  2681. +test_h264_chroma_mc_funcs(avg, 8);
  2682. +
  2683. +#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
  2684. +void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
  2685. + test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
  2686. + dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
  2687. +
  2688. +#define test_qpel_mc_funcs(PFX, NUM) \
  2689. + test_qpel_mc_funcs_type(PFX, NUM, mc00);\
  2690. + test_qpel_mc_funcs_type(PFX, NUM, mc10);\
  2691. + test_qpel_mc_funcs_type(PFX, NUM, mc20);\
  2692. + test_qpel_mc_funcs_type(PFX, NUM, mc30);\
  2693. + test_qpel_mc_funcs_type(PFX, NUM, mc01);\
  2694. + test_qpel_mc_funcs_type(PFX, NUM, mc11);\
  2695. + test_qpel_mc_funcs_type(PFX, NUM, mc21);\
  2696. + test_qpel_mc_funcs_type(PFX, NUM, mc31);\
  2697. + test_qpel_mc_funcs_type(PFX, NUM, mc02);\
  2698. + test_qpel_mc_funcs_type(PFX, NUM, mc12);\
  2699. + test_qpel_mc_funcs_type(PFX, NUM, mc22);\
  2700. + test_qpel_mc_funcs_type(PFX, NUM, mc32);\
  2701. + test_qpel_mc_funcs_type(PFX, NUM, mc03);\
  2702. + test_qpel_mc_funcs_type(PFX, NUM, mc13);\
  2703. + test_qpel_mc_funcs_type(PFX, NUM, mc23);\
  2704. + test_qpel_mc_funcs_type(PFX, NUM, mc33)
  2705. +
  2706. +test_qpel_mc_funcs(put_h264_qpel, 4);
  2707. +test_qpel_mc_funcs(put_h264_qpel, 8);
  2708. +test_qpel_mc_funcs(put_h264_qpel, 16);
  2709. +test_qpel_mc_funcs(avg_h264_qpel, 4);
  2710. +test_qpel_mc_funcs(avg_h264_qpel, 8);
  2711. +test_qpel_mc_funcs(avg_h264_qpel, 16);
  2712. +
  2713. +
  2714. +#define dspfunc(PFX, IDX, NUM) \
  2715. + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
  2716. + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
  2717. + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
  2718. + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
  2719. + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
  2720. + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
  2721. + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
  2722. + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
  2723. + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
  2724. + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
  2725. + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
  2726. + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
  2727. + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
  2728. + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
  2729. + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
  2730. + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
  2731. +
  2732. +#endif
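
DSP_FUNC_NAME is defined earlier in this patch; a hypothetical reconstruction consistent with the test_ wrappers above (an assumption, not the verbatim definition) would be:

/* Assumed sketch: with checking enabled the DSP tables point at the
   test_ wrappers, which run both implementations and compare their
   output; otherwise they point straight at the optimized routines. */
#ifdef CHECK_DSP_FUNCS_AGAINST_C
#define DSP_FUNC_NAME(name) test_ ## name
#else
#define DSP_FUNC_NAME(name) name
#endif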
  2733. +
  2734. +void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
  2735. +{
  2736. +
  2737. + /* H264 */
  2738. +
  2739. + if ( 0 /*avr32_use_pico*/ ){
  2740. + c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
  2741. + c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
  2742. + c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
  2743. +
  2744. + c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
  2745. + c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
  2746. + c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
  2747. + }
  2748. +
  2749. +#define dspfunc(PFX, IDX, NUM) \
  2750. + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
  2751. + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
  2752. + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
  2753. + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
  2754. + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
  2755. + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
  2756. + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
  2757. + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
  2758. + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
  2759. + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
  2760. + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
  2761. + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
  2762. + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
  2763. + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
  2764. + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
  2765. + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
  2766. +
  2767. + if ( avr32_use_pico ){
  2768. + dspfunc(put_h264_qpel, 0, 16);
  2769. + dspfunc(put_h264_qpel, 1, 8);
  2770. + dspfunc(put_h264_qpel, 2, 4);
  2771. + dspfunc(avg_h264_qpel, 0, 16);
  2772. + dspfunc(avg_h264_qpel, 1, 8);
  2773. + dspfunc(avg_h264_qpel, 2, 4);
  2774. + }
  2775. +
  2776. + c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
  2777. + c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
  2778. + c->idct = DSP_FUNC_NAME(idct_avr32);
  2779. + c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
  2780. + c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
  2781. +
  2782. + /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
  2783. +
  2784. + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
  2785. +
  2786. + c->fdct = fdct_avr32;
  2787. +
  2788. + c->clear_blocks = clear_blocks_avr32;
  2789. +
  2790. +#undef dspfunc
  2791. +#define dspfunc(PFX, IDX, NUM) \
  2792. + c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
  2793. + c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
  2794. + c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
  2795. + c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
  2796. +
  2797. + dspfunc(put, 0, 16);
  2798. + dspfunc(put_no_rnd, 0, 16);
  2799. + dspfunc(put, 1, 8);
  2800. + dspfunc(put_no_rnd, 1, 8);
  2801. +
  2802. + dspfunc(avg, 1, 8);
  2803. + dspfunc(avg_no_rnd, 1, 8);
  2804. + dspfunc(avg, 0, 16);
  2805. + dspfunc(avg_no_rnd, 0, 16);
  2806. +#undef dspfunc
  2807. +
  2808. +}
  2809. +
  2810. +
  2811. +
  2812. +#if 0
  2813. +int main(int argc, char *argv[]){
  2814. +
  2815. +
  2816. +}
  2817. +#endif
  2818. +
  2819. --- /dev/null
  2820. +++ b/libavcodec/avr32/fdct.S
  2821. @@ -0,0 +1,541 @@
  2822. +/*
  2823. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  2824. + *
  2825. + * Redistribution and use in source and binary forms, with or without
  2826. + * modification, are permitted provided that the following conditions
  2827. + * are met:
  2828. + *
  2829. + * 1. Redistributions of source code must retain the above copyright
  2830. + * notice, this list of conditions and the following disclaimer.
  2831. + *
  2832. + * 2. Redistributions in binary form must reproduce the above
  2833. + * copyright notice, this list of conditions and the following
  2834. + * disclaimer in the documentation and/or other materials provided
  2835. + * with the distribution.
  2836. + *
  2837. + * 3. The name of ATMEL may not be used to endorse or promote products
  2838. + * derived from this software without specific prior written
  2839. + * permission.
  2840. + *
  2841. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  2842. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  2843. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  2844. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  2845. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  2846. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  2847. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  2848. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  2849. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  2850. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  2851. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  2852. + * DAMAGE.
  2853. + */
  2854. +
  2855. +//**********************************************************
  2856. +//* 2-D fDCT, Based on: *
  2857. +//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
  2858. +//* Fast 1-D DCT Algorithms with 11 Multiplications", *
  2859. +//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal *
  2860. +//* Processing 1989 (ICASSP '89), pp. 988-991. *
  2861. +//* *
  2862. +//* Fixed point implementation optimized for the AVR32    *
  2863. +//* instruction set. The coefficients are kept in a table  *
  2864. +//* so that two of them can be loaded with a single memory *
  2865. +//* access, which reduces the number of loads.             *
  2866. +//*                                                        *
  2867. +//*                                                        *
  2868. +//**********************************************************
  2869. +
  2870. +
  2871. +/* This routine is a slow-but-accurate integer implementation of the
  2872. + * forward DCT (Discrete Cosine Transform). Taken from the IJG software
  2873. + *
  2874. + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  2875. + * on each column. Direct algorithms are also available, but they are
  2876. + * much more complex and seem not to be any faster when reduced to code.
  2877. + *
  2878. + * This implementation is based on an algorithm described in
  2879. + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  2880. + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  2881. + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  2882. + * The primary algorithm described there uses 11 multiplies and 29 adds.
  2883. + * We use their alternate method with 12 multiplies and 32 adds.
  2884. + * The advantage of this method is that no data path contains more than one
  2885. + * multiplication; this allows a very simple and accurate implementation in
  2886. + * scaled fixed-point arithmetic, with a minimal number of shifts.
  2887. + *
  2888. + * The poop on this scaling stuff is as follows:
  2889. + *
  2890. + * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
  2891. + * larger than the true DCT outputs. The final outputs are therefore
  2892. + * a factor of N larger than desired; since N=8 this can be cured by
  2893. + * a simple right shift at the end of the algorithm. The advantage of
  2894. + * this arrangement is that we save two multiplications per 1-D DCT,
  2895. + * because the y0 and y4 outputs need not be divided by sqrt(N).
  2896. + * In the IJG code, this factor of 8 is removed by the quantization step
  2897. + * (in jcdctmgr.c); here it is removed by the final descaling in pass 2.
  2898. + *
  2899. + * We have to do addition and subtraction of the integer inputs, which
  2900. + * is no problem, and multiplication by fractional constants, which is
  2901. + * a problem to do in integer arithmetic. We multiply all the constants
  2902. + * by CONST_SCALE and convert them to integer constants (thus retaining
  2903. + * CONST_BITS bits of precision in the constants). After doing a
  2904. + * multiplication we have to divide the product by CONST_SCALE, with proper
  2905. + * rounding, to produce the correct output. This division can be done
  2906. + * cheaply as a right shift of CONST_BITS bits. We postpone shifting
  2907. + * as long as possible so that partial sums can be added together with
  2908. + * full fractional precision.
  2909. + *
  2910. + * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  2911. + * they are represented to better-than-integral precision. These outputs
  2912. + * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
  2913. + * with the recommended scaling. (For 12-bit sample data, the intermediate
  2914. + * array is INT32 anyway.)
  2915. + *
  2916. + * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  2917. + * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
  2918. + * shows that the values given below are the most effective.
  2919. + *
  2920. + * We can gain a little more speed, with a further compromise in accuracy,
  2921. + * by omitting the addition in a descaling shift. This yields an incorrectly
  2922. + * rounded result half the time...
  2923. + */
  2924. +
  2925. + .global fdct_avr32
  2926. +
  2927. +
  2928. +
  2929. +#define CONST_BITS 13
  2930. +#define PASS1_BITS 2
  2931. +
  2932. +#define FIX_0_298631336 2446 /* FIX(0.298631336) */
  2933. +#define FIX_0_390180644 3196 /* FIX(0.390180644) */
  2934. +#define FIX_0_541196100 4433 /* FIX(0.541196100) */
  2935. +#define FIX_0_765366865 6270 /* FIX(0.765366865) */
  2936. +#define FIX_0_899976223 7373 /* FIX(0.899976223) */
  2937. +#define FIX_1_175875602 9633 /* FIX(1.175875602) */
  2938. +#define FIX_1_501321110 12299 /* FIX(1.501321110) */
  2939. +#define FIX_1_847759065 15137 /* FIX(1.847759065) */
  2940. +#define FIX_1_961570560 16069 /* FIX(1.961570560) */
  2941. +#define FIX_2_053119869 16819 /* FIX(2.053119869) */
  2942. +#define FIX_2_562915447 20995 /* FIX(2.562915447) */
  2943. +#define FIX_3_072711026 25172 /* FIX(3.072711026) */
  2944. +
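
As a sketch of the fixed-point convention described above, using the IJG-style FIX()/DESCALE() macros these constants come from (the macros are an assumption reconstructed from the values, e.g. 0.541196100 * 2^13 rounds to 4433; the satrnds instruction used below performs the rounding shift with saturation on top):

#include <stdint.h>

#define CONST_BITS 13   /* as defined above */
#define PASS1_BITS 2
#define CONST_SCALE (1 << CONST_BITS)

/* Scale a constant to CONST_BITS bits of fractional precision. */
#define FIX(x) ((int32_t)((x) * CONST_SCALE + 0.5))

/* Divide by 2^n with rounding, removing the CONST_SCALE factor after
   a multiply. */
#define DESCALE(x, n) (((x) + ((int32_t)1 << ((n) - 1))) >> (n))

/* One even-part rotator, as computed in the row pass below: */
static void rotate_even(int32_t tmp12, int32_t tmp13,
                        int32_t *d2, int32_t *d6)
{
    int32_t z1 = (tmp12 + tmp13) * FIX(0.541196100);
    *d2 = DESCALE(z1 + tmp13 * FIX(0.765366865), CONST_BITS - PASS1_BITS);
    *d6 = DESCALE(z1 - tmp12 * FIX(1.847759065), CONST_BITS - PASS1_BITS);
}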
  2945. +
  2946. +/*
  2947. + * Perform an integer forward DCT on one block of samples.
  2948. + */
  2949. +
  2950. +//void
  2951. +//fdct_int32(short *const block)
  2952. +//{
  2953. +// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2954. +// int tmp10, tmp11, tmp12, tmp13;
  2955. +// int z1, z2, z3, z4, z5;
  2956. +// short *blkptr;
  2957. +// int *dataptr;
  2958. +// int data[64];
  2959. +// int i;
  2960. +//
  2961. +// /* Pass 1: process rows. */
  2962. +// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2963. +// /* furthermore, we scale the results by 2**PASS1_BITS. */
  2964. +//
  2965. +// dataptr = data;
  2966. +// blkptr = block;
  2967. +
  2968. + .text
  2969. +fdct_avr32:
  2970. + pushm r0-r3, r4-r7, lr
  2971. +#define loop_ctr r0
  2972. +#define blkptr r12
  2973. +#define x0 r1
  2974. +#define x1 r2
  2975. +#define x2 r3
  2976. +#define x3 r4
  2977. +#define x4 r5
  2978. +#define x5 r6
  2979. +#define x6 r7
  2980. +#define x7 r8
  2981. +#define tmp0 r5
  2982. +#define tmp7 r2
  2983. +#define tmp1 r3
  2984. +#define tmp6 r4
  2985. +#define tmp2 r9
  2986. +#define tmp5 r8
  2987. +#define tmp3 r7
  2988. +#define tmp4 r6
  2989. +
  2990. +
  2991. + mov loop_ctr, 8
  2992. +// for (i = 0; i < 8; i++) {
  2993. +ROW_LOOP:
  2994. +
  2995. + ldm blkptr, r1, r2, r3, r4
  2996. +
  2997. +// tmp2 = blkptr[2] + blkptr[5];
  2998. +// tmp3 = blkptr[3] + blkptr[4];
  2999. + paddx.h r5, r3, r2
  3000. +// tmp5 = blkptr[2] - blkptr[5];
  3001. +// tmp4 = blkptr[3] - blkptr[4];
  3002. + psubx.h r6, r3, r2
  3003. +// tmp0 = blkptr[0] + blkptr[7];
  3004. +// tmp1 = blkptr[1] + blkptr[6];
  3005. + paddx.h r2, r4, r1
  3006. +// tmp7 = blkptr[0] - blkptr[7];
  3007. +// tmp6 = blkptr[1] - blkptr[6];
  3008. + psubx.h r3, r4, r1
  3009. +
  3010. +// /* Even part per LL&M figure 1 --- note that published figure is faulty;
  3011. +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  3012. +// */
  3013. +
  3014. +#define tmp10 r1
  3015. +#define tmp13 r5
  3016. +#define tmp11 r7
  3017. +#define tmp12 r3
  3018. +#define z1 r9
  3019. +
  3020. +// tmp10 = tmp0 + tmp3;
  3021. +// tmp13 = tmp0 - tmp3;
  3022. + paddsub.h r1, r2:t, r5:b
  3023. +// tmp11 = tmp1 + tmp2;
  3024. +// tmp12 = tmp1 - tmp2;
  3025. + paddsub.h r4, r2:b, r5:t
  3026. +
  3027. +
  3028. +// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
  3029. +// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
  3030. + paddsub.h r7, r1:t, r4:t
  3031. + ld.w r10, pc[const_table - .]
  3032. + plsl.h r7, r7, PASS1_BITS
  3033. +
  3034. +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
  3035. + addhh.w r8, r4:b, r1:b
  3036. + mulhh.w r8, r8:b, r10:t
  3037. +
  3038. +// dataptr[2] =
  3039. +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
  3040. +// dataptr[6] =
  3041. +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
  3042. + mulhh.w r9, r1:b, r10:b
  3043. + ld.w r10, pc[const_table - . + 4]
  3044. + add r1, r8, r9
  3045. + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
  3046. +
  3047. + mulhh.w r9, r4:b, r10:t
  3048. + add r4, r8, r9
  3049. + satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
  3050. +
  3051. +
  3052. +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  3053. +// * cK represents cos(K*pi/16).
  3054. +// * i0..i3 in the paper are tmp4..tmp7 here.
  3055. +// */
  3056. +
  3057. +#define z2 r5
  3058. +#define z3 r6
  3059. +#define z4 r7
  3060. +#define z5 r8
  3061. +
  3062. +// z4 = tmp5 + tmp7;
  3063. +// z3 = tmp4 + tmp6;
  3064. + padd.h r2, r6, r3
  3065. +// z2 = tmp5 + tmp6;
  3066. +// z1 = tmp4 + tmp7;
  3067. + paddx.h r5, r6, r3
  3068. +
  3069. + lddpc r9, pc[const_table - . + 8]
  3070. +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
  3071. + addhh.w r8, r2:t, r2:b
  3072. + mulhh.w r8, r8:b, r10:b
  3073. + lddpc r10, pc[const_table - . + 12]
  3074. +
  3075. +
  3076. +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
  3077. + mulhh.w r11, r6:b, r9:t
  3078. +
  3079. +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
  3080. + mulhh.w r6, r6:t, r9:b
  3081. +
  3082. +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
  3083. + lddpc r9, pc[const_table - . + 20]
  3084. + mulhh.w lr, r3:b, r10:t
  3085. +
  3086. +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
  3087. + mulhh.w r3, r3:t, r10:b
  3088. +
  3089. +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
  3090. + mulhh.w r10, r2:b, r9:t
  3091. +
  3092. +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
  3093. + mulhh.w r2, r2:t, r9:b
  3094. + lddpc r9, pc[const_table - . + 16]
  3095. +// z3 += z5;
  3096. +// z4 += z5;
  3097. + add r10, r8
  3098. + add r2, r8
  3099. +
  3100. +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
  3101. + mulhh.w r8, r5:b, r9:t
  3102. +
  3103. +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
  3104. + mulhh.w r5, r5:t, r9:b
  3105. +
  3106. +// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
  3107. + add r11, r8
  3108. + add r11, r10
  3109. + satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
  3110. +
  3111. +// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
  3112. + add r6, r5
  3113. +
  3114. + sthh.w blkptr[6*2], r4:b, r11:b
  3115. + add r6, r2
  3116. + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
  3117. +
  3118. +// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
  3119. + add lr, r5
  3120. + sthh.w blkptr[4*2], r7:b, r6:b
  3121. + add lr, r10
  3122. + satrnds lr >> (CONST_BITS - PASS1_BITS), 31
  3123. +
  3124. +// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
  3125. + add r3, r8
  3126. + sthh.w blkptr[2*2], r1:b, lr:b
  3127. + add r3, r2
  3128. + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
  3129. +
  3130. +
  3131. +
  3132. +// dataptr += 8; /* advance pointer to next row */
  3133. +// blkptr += 8;
  3134. + sthh.w blkptr[0], r7:t, r3:b
  3135. + sub blkptr, -16
  3136. + sub loop_ctr, 1
  3137. + brne ROW_LOOP
  3138. +
  3139. +// }
  3140. +
  3141. + /* Pass 2: process columns.
  3142. + * We remove the PASS1_BITS scaling and also the overall factor of 8
  3143. + * (the extra shift by 3 in the descaling below).
  3144. + */
  3145. +
  3146. +// dataptr = data;
  3147. + sub blkptr, 128
  3148. +
  3149. + mov loop_ctr, 4
  3150. +// for (i = 0; i < 8; i += 2) { /* two columns per iteration */
  3151. +COLUMN_LOOP:
  3152. + ld.w r1, blkptr[0]
  3153. + ld.w r2, blkptr[1*8*2]
  3154. + ld.w r3, blkptr[2*8*2]
  3155. + ld.w r4, blkptr[3*8*2]
  3156. + ld.w r5, blkptr[4*8*2]
  3157. + ld.w r6, blkptr[5*8*2]
  3158. + ld.w r7, blkptr[6*8*2]
  3159. + ld.w r8, blkptr[7*8*2]
  3160. +
  3161. +// tmp0 = blkptr[0] + blkptr[7*8];
  3162. + padds.sh r9, r1, r8
  3163. +// tmp7 = blkptr[0] - blkptr[7*8];
  3164. + psubs.sh r1, r1, r8
  3165. +// tmp1 = blkptr[1*8] + blkptr[6*8];
  3166. + padds.sh r8, r2, r7
  3167. +// tmp6 = blkptr[1*8] - blkptr[6*8];
  3168. + psubs.sh r2, r2, r7
  3169. +// tmp2 = blkptr[2*8] + blkptr[5*8];
  3170. + padds.sh r7, r3, r6
  3171. +// tmp5 = blkptr[2*8] - blkptr[5*8];
  3172. + psubs.sh r3, r3, r6
  3173. +// tmp3 = blkptr[3*8] + blkptr[4*8];
  3174. + padds.sh r6, r4, r5
  3175. +// tmp4 = blkptr[3*8] - blkptr[4*8];
  3176. + psubs.sh r4, r4, r5
  3177. +
  3178. +// /* even part per ll&m figure 1 --- note that published figure is faulty;
  3179. +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  3180. +// */
  3181. +//
  3182. +// tmp10 = tmp0 + tmp3;
  3183. + padds.sh r5, r9, r6
  3184. +// tmp13 = tmp0 - tmp3;
  3185. + psubs.sh r9, r9, r6
  3186. +// tmp11 = tmp1 + tmp2;
  3187. + padds.sh r6, r8, r7
  3188. +// tmp12 = tmp1 - tmp2;
  3189. + psubs.sh r8, r8, r7
  3190. +
  3191. +// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
  3192. +// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
  3193. +//Might get an overflow here
  3194. + padds.sh r7, r5, r6
  3195. + psubs.sh r5, r5, r6
  3196. +
  3197. + //Rounding
  3198. + mov lr, (1 << (PASS1_BITS + 2))
  3199. + orh lr, hi(1 << (16 + PASS1_BITS + 2))
  3200. + padds.sh r7, r7, lr
  3201. + padds.sh r5, r5, lr
  3202. +
  3203. + pasr.h r7, r7, PASS1_BITS + 3
  3204. + pasr.h r5, r5, PASS1_BITS + 3
  3205. + st.w r12[0], r7
  3206. + st.w r12[4*8*2], r5
  3207. +
  3208. + lddpc r10, const_table2
  3209. +
  3210. +
  3211. +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
  3212. + padds.sh r5, r8, r9
  3213. + mulhh.w r6, r5:t, r10:t
  3214. + mulhh.w r7, r5:b, r10:t
  3215. +
  3216. +// dataptr[16] =
  3217. +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
  3218. + lddpc r11, const_table2 + 4
  3219. + mulhh.w lr, r9:t, r10:b
  3220. + mulhh.w r9, r9:b, r10:b
  3221. + add lr, r6
  3222. + add r9, r7
  3223. + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
  3224. + satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
  3225. + sthh.w r12[2*8*2], lr:b, r9:b
  3226. +
  3227. +// dataptr[48] =
  3228. +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
  3229. + mulhh.w lr, r8:t, r11:t
  3230. + mulhh.w r8, r8:b, r11:t
  3231. + add lr, r6
  3232. + add r8, r7
  3233. + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
  3234. + satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
  3235. + sthh.w r12[6*8*2], lr:b, r8:b
  3236. +
  3237. +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  3238. +// * cK represents cos(K*pi/16).
  3239. +// * i0..i3 in the paper are tmp4..tmp7 here.
  3240. +// */
  3241. +//
  3242. +// z2 = tmp5 + tmp6;
  3243. +// z3 = tmp4 + tmp6;
  3244. +// z4 = tmp5 + tmp7;
  3245. + padds.sh r5, r3, r2
  3246. + padds.sh r6, r4, r2
  3247. + padds.sh r7, r3, r1
  3248. +
  3249. +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
  3250. + padds.sh r8, r6, r7
  3251. + mulhh.w r9, r8:t, r11:b
  3252. + mulhh.w r8, r8:b, r11:b
  3253. +
  3254. +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
  3255. +// z3 += z5;
  3256. + lddpc r11, const_table2 + 8
  3257. + mulhh.w r10, r6:t, r11:t
  3258. + mulhh.w r6, r6:b, r11:t
  3259. + add r10, r9
  3260. + add r6, r8
  3261. +
  3262. +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
  3263. +// z4 += z5;
  3264. + mulhh.w lr, r7:t, r11:b
  3265. + mulhh.w r7, r7:b, r11:b
  3266. + lddpc r11, const_table2 + 12
  3267. + st.w --sp,r0
  3268. + add lr, r9
  3269. + add r7, r8
  3270. +
  3271. +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
  3272. + mulhh.w r0, r2:t, r11:t
  3273. + machh.w r0, r5:t, r11:b
  3274. + mulhh.w r2, r2:b, r11:t
  3275. + machh.w r2, r5:b, r11:b
  3276. +
  3277. +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
  3278. +// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
  3279. + add r0, r10
  3280. + lddpc r11, const_table2 + 16
  3281. + add r2, r6
  3282. + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
  3283. + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
  3284. + sthh.w r12[3*8*2], r0:b, r2:b
  3285. +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
  3286. + mulhh.w r0, r3:t, r11:t
  3287. + machh.w r0, r5:t, r11:b
  3288. + mulhh.w r2, r3:b, r11:t
  3289. + machh.w r2, r5:b, r11:b
  3290. + add r0, lr
  3291. + lddpc r11, const_table2 + 20
  3292. + add r2, r7
  3293. +
  3294. +// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
  3295. + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
  3296. + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
  3297. + sthh.w r12[5*8*2], r0:b, r2:b
  3298. +
  3299. +
  3300. +// z1 = tmp4 + tmp7;
  3301. + padds.sh r2, r4, r1
  3302. +
  3303. +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
  3304. + mulhh.w r3, r4:t, r11:t
  3305. + machh.w r3, r2:t, r11:b
  3306. + mulhh.w r4, r4:b, r11:t
  3307. + machh.w r4, r2:b, r11:b
  3308. + add r3, r10
  3309. + lddpc r11, const_table2 + 24
  3310. + add r4, r6
  3311. +
  3312. +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
  3313. +// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
  3314. + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
  3315. + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
  3316. + sthh.w r12[7*8*2], r3:b, r4:b
  3317. +
  3318. +
  3319. +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
  3320. + mulhh.w r3, r1:t, r11:t
  3321. + machh.w r3, r2:t, r11:b
  3322. + mulhh.w r4, r1:b, r11:t
  3323. + machh.w r4, r2:b, r11:b
  3324. + add r3, lr
  3325. + add r4, r7
  3326. +
  3327. +// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
  3328. + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
  3329. + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
  3330. + sthh.w r12[1*8*2], r3:b, r4:b
  3331. + ld.w r0, sp++
  3332. +
  3333. +// dataptr++; /* advance pointer to next column */
  3334. + sub blkptr, -4
  3335. + sub loop_ctr, 1
  3336. + brne COLUMN_LOOP
  3337. +
  3338. +// }
  3339. +
  3340. + popm r0-r3, r4-r7, pc
  3341. +
  3342. +// /* descale */
  3343. +// for (i = 0; i < 64; i++)
  3344. +// block[i] = (short int) DESCALE(data[i], 3);
  3345. +
  3346. +
  3347. +//}
  3348. +
  3349. +
  3350. + .align 2
  3351. +const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
  3352. + .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
  3353. + .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
  3354. +
  3355. +const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
  3356. + .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
  3357. + .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
  3358. + .short FIX_1_501321110, -FIX_0_899976223
  3359. +
  3360. +
  3361. +
  3362. +
  3363. --- /dev/null
  3364. +++ b/libavcodec/avr32/h264idct.S
  3365. @@ -0,0 +1,451 @@
  3366. +/*
  3367. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  3368. + *
  3369. + * Redistribution and use in source and binary forms, with or without
  3370. + * modification, are permitted provided that the following conditions
  3371. + * are met:
  3372. + *
  3373. + * 1. Redistributions of source code must retain the above copyright
  3374. + * notice, this list of conditions and the following disclaimer.
  3375. + *
  3376. + * 2. Redistributions in binary form must reproduce the above
  3377. + * copyright notice, this list of conditions and the following
  3378. + * disclaimer in the documentation and/or other materials provided
  3379. + * with the distribution.
  3380. + *
  3381. + * 3. The name of ATMEL may not be used to endorse or promote products
  3382. + * derived from this software without specific prior written
  3383. + * permission.
  3384. + *
  3385. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  3386. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  3387. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  3388. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  3389. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  3390. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  3391. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  3392. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  3393. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  3394. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  3395. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  3396. + * DAMAGE.
  3397. + */
  3398. +
  3399. + .global h264_idct_add_avr32
  3400. +
  3401. + /* Macro for performing the 1-D transform on one row line.
  3402. +
  3403. + The register 'w01' should contain the first two pixels,
  3404. + and the register 'w23' should contain the last two pixels
  3405. + in the line. The resulting line is placed back in w01 and w23
  3406. + so that { w01, w23 } = { x0, x1, x3, x2 }.
  3407. + 'tmp' and 'tmp2' should be scratchpad registers. */
  3408. + .macro transform_row w01, w23, tmp, tmp2
  3409. + add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
  3410. + sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
  3411. + bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
  3412. + pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
  3413. + paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
  3414. + padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
  3415. + psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
  3416. + .endm
  3417. +
  3418. + /* Macro for performing the 1-D transform on two columns.
  3419. +
  3420. + The registers w0, w1, w2, w3 should each contain two
  3421. + packed samples from the two columns to transform.
  3422. + tmp and tmp2 are scratchpad registers.
  3423. +
  3424. + The resulting transformed columns are placed in the
  3425. + same positions as the input columns.
  3426. + */
  3427. + .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
  3428. + padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
  3429. + psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
  3430. + pasr.h \w2, \w1, 1 /* w2 = w1/2 */
  3431. + pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
  3432. + psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
  3433. + padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
  3434. + padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
  3435. + psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
  3436. + padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
  3437. + psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
  3438. + /* Scale down result. */
  3439. + pasr.h \w0, \w0, 6
  3440. + pasr.h \w1, \w1, 6
  3441. + pasr.h \w2, \w2, 6
  3442. + pasr.h \w3, \w3, 6
  3443. + .endm
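
Both macros implement the same 4-point kernel from the H.264 transform; a reference sketch in C (the function name is illustrative):

/* One 4-point H.264 inverse-transform butterfly. The column pass
   additionally shifts each result right by 6, as done at the end of
   the macro above. */
static inline void h264_idct4_1d(int x[4])
{
    const int z0 = x[0] + x[2];
    const int z1 = x[0] - x[2];
    const int z2 = (x[1] >> 1) - x[3];
    const int z3 = x[1] + (x[3] >> 1);

    x[0] = z0 + z3;
    x[1] = z1 + z2;
    x[2] = z1 - z2;
    x[3] = z0 - z3;
}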
  3444. +
  3445. +/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
  3446. +
  3447. +h264_idct_add_avr32:
  3448. +
  3449. + stm --sp,r0-r3,r4-r7, lr
  3450. +
  3451. + /* Setup rounding factor. */
  3452. + mov r0, (1 << 5)
  3453. + lsl r0, 16
  3454. +
  3455. + /* Load block */
  3456. + ldm r11,r2-r9
  3457. + /* r9 = { w00, w01 },
  3458. + r8 = { w02, w03 },
  3459. + r7 = { w10, w11 },
  3460. + r6 = { w12, w13 },
  3461. + r5 = { w20, w21 },
  3462. + r4 = { w22, w23 },
  3463. + r3 = { w30, w31 },
  3464. + r2 = { w32, w33 } */
  3465. +
  3466. +
  3467. + /* Add the rounding factor to w00. */
  3468. + add r9, r0
  3469. +
  3470. + /* Transform rows */
  3471. + transform_row r9, r8, r0, r1
  3472. + transform_row r7, r6, r0, r1
  3473. + transform_row r5, r4, r0, r1
  3474. + transform_row r3, r2, r0, r1
  3475. +
  3476. + /* Transform columns */
  3477. + transform_2columns r9, r7, r5, r3, r0, r1
  3478. + transform_2columns r8, r6, r4, r2, r0, r1
  3479. +
  3480. + /* Load predicted pixels.*/
  3481. + ld.w lr, r12[0]
  3482. + ld.w r11, r12[r10]
  3483. +
  3484. + /* Unpack to halfwords. */
  3485. + punpckub.h r0, lr:t
  3486. + punpckub.h r1, lr:b
  3487. +
  3488. + /* Add with transformed row. */
  3489. + padd.h r0, r0, r9
  3490. + paddx.h r1, r1, r8
  3491. + /* Pack and saturate back to 8-bit pixels. */
  3492. + packsh.ub r0, r0, r1
  3493. +
  3494. + /* Unpack to halfwords. */
  3495. + punpckub.h lr, r11:t
  3496. + punpckub.h r11, r11:b
  3497. +
  3498. + /* Add with transformed row. */
  3499. + padd.h lr, lr, r7
  3500. + paddx.h r11, r11, r6
  3501. + /* Pack and saturate back to 8-bit pixels. */
  3502. + packsh.ub r1, lr, r11
  3503. +
  3504. + /* Store back to frame. */
  3505. + st.w r12[0], r0
  3506. + st.w r12[r10], r1
  3507. +
  3508. + add r12, r12, r10 << 1
  3509. +
  3510. + /* Load predicted pixels.*/
  3511. + ld.w lr, r12[0]
  3512. + ld.w r11, r12[r10]
  3513. +
  3514. + /* Unpack to halfwords. */
  3515. + punpckub.h r0, lr:t
  3516. + punpckub.h r1, lr:b
  3517. +
  3518. + /* Add with transformed row. */
  3519. + padd.h r0, r0, r5
  3520. + paddx.h r1, r1, r4
  3521. + /* Pack and saturate back to 8-bit pixels. */
  3522. + packsh.ub r0, r0, r1
  3523. +
  3524. + /* Unpack to halfwords. */
  3525. + punpckub.h lr, r11:t
  3526. + punpckub.h r11, r11:b
  3527. +
  3528. + /* Add with transformed row. */
  3529. + padd.h lr, lr, r3
  3530. + paddx.h r11, r11, r2
  3531. + /* Pack and saturate back to 8-bit pixels. */
  3532. + packsh.ub r1, lr, r11
  3533. +
  3534. + /* Store back to frame. */
  3535. + st.w r12[0], r0
  3536. + st.w r12[r10], r1
  3537. +
  3538. + ldm sp++,r0-r3,r4-r7, pc
  3539. +
  3540. +
  3541. + .global h264_idct8_add_avr32
  3542. +//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
  3543. +
  3544. +h264_idct8_add_avr32:
  3545. + stm --sp,r0-r3,r4-r7, lr
  3546. +
  3547. + /* Push dst and stride on stack */
  3548. + stm --sp,r10,r12
  3549. +
  3550. +// int i;
  3551. +// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
  3552. +// uint8_t *cm = cropTbl + MAX_NEG_CROP;
  3553. +
  3554. +// block[0] += 32;
  3555. +
  3556. +
  3557. +// for( i = 0; i < 8; i += 2 ) /* two columns per iteration */
  3558. +// {
  3559. + mov lr, 4
  3560. +0:
  3561. + ld.w r7, r11[0*(8*2)]
  3562. + ld.w r6, r11[1*(8*2)]
  3563. + ld.w r5, r11[2*(8*2)]
  3564. + ld.w r4, r11[3*(8*2)]
  3565. + ld.w r3, r11[4*(8*2)]
  3566. + ld.w r2, r11[5*(8*2)]
  3567. + ld.w r1, r11[6*(8*2)]
  3568. + ld.w r0, r11[7*(8*2)]
  3569. +
  3570. +/*
  3571. +
  3572. + const int a0 = src[0][i] + src[4][i];
  3573. + const int a2 = src[0][i] - src[4][i];
  3574. + const int a4 = (src[2][i]>>1) - src[6][i];
  3575. + const int a6 = (src[6][i]>>1) + src[2][i];
  3576. +*/
  3577. + padd.h r8, r7, r3 /* r8 = a0 */
  3578. + psub.h r7, r7, r3 /* r7 = a2 */
  3579. + pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
  3580. + pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
  3581. + psub.h r3, r3, r1 /* r3 = a4 */
  3582. + padd.h r9, r9, r5 /* r9 = a6 */
  3583. +
  3584. +/*
  3585. + const int b0 = a0 + a6;
  3586. + const int b2 = a2 + a4;
  3587. + const int b4 = a2 - a4;
  3588. + const int b6 = a0 - a6;
  3589. +*/
  3590. + padd.h r1, r8, r9 /* r1 = b0 */
  3591. + psub.h r8, r8, r9 /* r8 = b6 */
  3592. + padd.h r5, r7, r3 /* r5 = b2 */
  3593. + psub.h r7, r7, r3 /* r7 = b4 */
  3594. +
  3595. +/*
  3596. + const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
  3597. + const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
  3598. + const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
  3599. + const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
  3600. +*/
  3601. + pasr.h r3, r0, 1
  3602. + padd.h r3, r3, r0
  3603. + psub.h r3, r2, r3
  3604. + psub.h r3, r3, r4 /* r3 = a1 */
  3605. +
  3606. + pasr.h r9, r4, 1
  3607. + padd.h r9, r9, r4
  3608. + psub.h r9, r0, r9
  3609. + padd.h r9, r6, r9 /* r9 = a3 */
  3610. +
  3611. + pasr.h r10, r2, 1
  3612. + padd.h r10, r10, r2
  3613. + padd.h r10, r10, r0
  3614. + psub.h r10, r10, r6 /* r10 = a5 */
  3615. +
  3616. + pasr.h r0, r6, 1
  3617. + padd.h r0, r0, r6
  3618. + padd.h r0, r0, r2
  3619. + padd.h r0, r0, r4 /* r0 = a7 */
  3620. +/*
  3621. + const int b1 = (a7>>2) + a1;
  3622. + const int b3 = a3 + (a5>>2);
  3623. + const int b5 = (a3>>2) - a5;
  3624. + const int b7 = a7 - (a1>>2);
  3625. +*/
  3626. + pasr.h r2, r0, 2
  3627. + padd.h r2, r2, r3 /* r2 = b1 */
  3628. + pasr.h r3, r3, 2
  3629. + psub.h r3, r0, r3 /* r3 = b7 */
  3630. +
  3631. + pasr.h r0, r10, 2
  3632. + padd.h r0, r0, r9 /* r0 = b3 */
  3633. + pasr.h r9, r9, 2
  3634. + psub.h r9, r9, r10 /* r9 = b5 */
  3635. +
  3636. +
  3637. +/*
  3638. + src[0][i] = b0 + b7;
  3639. + src[7][i] = b0 - b7;
  3640. + src[1][i] = b2 + b5;
  3641. + src[6][i] = b2 - b5;
  3642. + src[2][i] = b4 + b3;
  3643. + src[5][i] = b4 - b3;
  3644. + src[3][i] = b6 + b1;
  3645. + src[4][i] = b6 - b1; */
  3646. +
  3647. + padd.h r4, r1, r3
  3648. + psub.h r1, r1, r3
  3649. + st.w r11[0*(8*2)], r4
  3650. + st.w r11[7*(8*2)], r1
  3651. +
  3652. + padd.h r3, r5, r9
  3653. + psub.h r5, r5, r9
  3654. + st.w r11[1*(8*2)], r3
  3655. + st.w r11[6*(8*2)], r5
  3656. +
  3657. + padd.h r9, r7, r0
  3658. + psub.h r7, r7, r0
  3659. + st.w r11[2*(8*2)], r9
  3660. + st.w r11[5*(8*2)], r7
  3661. +
  3662. + padd.h r0, r8, r2
  3663. + psub.h r8, r8, r2
  3664. + st.w r11[3*(8*2)], r0
  3665. + st.w r11[4*(8*2)], r8
  3666. +
  3667. + sub r11, -4
  3668. + sub lr, 1
  3669. + brne 0b
  3670. +
  3671. +// }
  3672. +
  3673. + lddsp r12, sp[0] /* r12 = dst */
  3674. + sub r11, 4*4
  3675. + ldm r11++, r4-r7
  3676. + mov lr, 8
  3677. +
  3678. +
  3679. +1:
  3680. +// for( i = 0; i < 8; i++ )
  3681. +// {
  3682. +
  3683. + /* r7 = {src[i][0], src[i][1]}
  3684. + r6 = {src[i][2], src[i][3]}
  3685. + r5 = {src[i][4], src[i][5]}
  3686. + r4 = {src[i][6], src[i][7]} */
  3687. +
  3688. +/*
  3689. + const int a0 = src[i][0] + src[i][4];
  3690. + const int a2 = src[i][0] - src[i][4];
  3691. + const int a4 = (src[i][2]>>1) - src[i][6];
  3692. + const int a6 = (src[i][6]>>1) + src[i][2];
  3693. +*/
  3694. + pasr.h r8, r6, 1
  3695. + pasr.h r9, r4, 1
  3696. + addhh.w r0, r7:t, r5:t /* r0 = a0 */
  3697. + subhh.w r1, r7:t, r5:t /* r1 = a2 */
  3698. + subhh.w r2, r8:t, r4:t /* r2 = a4 */
  3699. + addhh.w r3, r9:t, r6:t /* r3 = a6 */
  3700. +
  3701. +/*
  3702. + const int b0 = a0 + a6;
  3703. + const int b2 = a2 + a4;
  3704. + const int b4 = a2 - a4;
  3705. + const int b6 = a0 - a6;
  3706. +*/
  3707. + add r10, r0, r3 /* r10 = b0 */
  3708. + sub r0, r3 /* r0 = b6 */
  3709. + add r3, r1, r2 /* r3 = b2 */
  3710. + sub r1, r2 /* r1 = b4 */
  3711. +/*
  3712. +
  3713. +
  3714. + const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
  3715. + const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
  3716. + const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
  3717. + const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
  3718. + addhh.w r8, r8:b, r6:b
  3719. + addhh.w r2, r4:b, r7:b
  3720. + sub r2, r8 /* r2 = a3 */
  3721. +
  3722. + addhh.w r9, r9:b, r4:b
  3723. + subhh.w r8, r5:b, r6:b
  3724. + sub r8, r9 /* r8 = a1 */
  3725. +
  3726. + pasr.h r9, r7, 1
  3727. + addhh.w r9, r9:b, r7:b
  3728. + addhh.w r6, r5:b, r6:b
  3729. + add r6, r9 /* r6 = a7 */
  3730. +
  3731. + pasr.h r9, r5, 1
  3732. + addhh.w r9, r9:b, r5:b
  3733. + subhh.w r5, r4:b, r7:b
  3734. + add r5, r9 /* r5 = a5 */
  3735. +
  3736. +/* const int b1 = (a7>>2) + a1;
  3737. + const int b3 = (a5>>2) + a3;
  3738. + const int b5 = (a3>>2) - a5;
  3739. + const int b7 = -(a1>>2) + a7 ; */
  3740. + asr r4, r6, 2
  3741. + add r4, r8 /* r4 = b1 */
  3742. + asr r8, 2
  3743. + rsub r8, r6 /* r8 = b7 */
  3744. +
  3745. + asr r6, r5, 2
  3746. + add r6, r2 /* r6 = b3 */
  3747. + asr r2, 2
  3748. + sub r2, r5 /* r2 = b5 */
  3749. +
  3750. +/*
  3751. + dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
  3752. + dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
  3753. + dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
  3754. + dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
  3755. + dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
  3756. + dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
  3757. + dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
  3758. + dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
  3759. +*/
  3760. + add r5, r10, r8
  3761. + satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
  3762. + sub r10, r8
  3763. + satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
  3764. + add r8, r3, r2
  3765. + satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
  3766. + sub r3, r2
  3767. + satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
  3768. +
  3769. + add r2, r1, r6
  3770. + satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
  3771. + sub r1, r6
  3772. + satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
  3773. +
  3774. + add r6, r0, r4
  3775. + satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
  3776. + sub r0, r4
  3777. + satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
  3778. +
  3779. + ld.w r4, r12[0]
  3780. +
  3781. + packw.sh r8, r5, r8
  3782. + packw.sh r7, r2, r6
  3783. + ld.w r9, r12[4]
  3784. + packw.sh r6, r0, r1
  3785. + packw.sh r5, r3, r10
  3786. +
  3787. + punpckub.h r10, r4:t
  3788. + punpckub.h r4, r4:b
  3789. + punpckub.h r3, r9:t
  3790. + punpckub.h r9, r9:b
  3791. +
  3792. + padd.h r8, r8, r10
  3793. + padd.h r7, r7, r4
  3794. + padd.h r6, r6, r3
  3795. + padd.h r5, r5, r9
  3796. +
  3797. + lddsp r10, sp[4] /* r10 = stride */
  3798. + packsh.ub r0, r8, r7
  3799. + packsh.ub r1, r6, r5
  3800. +
  3801. + st.w r12[0], r0
  3802. + st.w r12[4], r1
  3803. +
  3804. + ldm r11++, r4-r7
  3805. + add r12, r10 /* dst += stride */
  3806. +
  3807. + sub lr, 1
  3808. + brne 1b
  3809. +
  3810. + sub sp, -8
  3811. + ldm sp++,r0-r3,r4-r7, pc
  3812. +
  3813. +
  3814. +
  3815. +// }
  3816. +//}
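
Note how the store-back stage above avoids ffmpeg's cm[] clip table entirely: satrnds does the rounded >>6, the destination bytes are widened with punpckub.h, added with padd.h, and packsh.ub saturates the packed result back to unsigned bytes. A scalar model of that add-and-clip step (an illustrative sketch, not code from the patch):

#include <stdint.h>

/* models one lane of punpckub.h + padd.h + packsh.ub */
static inline uint8_t add_clip_u8(uint8_t dst, int residual)
{
    int t = dst + residual;   /* padd.h on the widened byte    */
    if (t < 0)   t = 0;       /* packsh.ub saturates to 0..255 */
    if (t > 255) t = 255;
    return (uint8_t)t;
}
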
--- /dev/null
+++ b/libavcodec/avr32/idct.S
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+ .global idct_add_avr32
+ .global idct_put_avr32
+ .global idct_avr32
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define ONE ((INT32) 1)
+
+#define CONST_SCALE (ONE << CONST_BITS)
+
+#define LINE_SIZE 32
+
+#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 (12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 (16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 (16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 (25172) /* FIX(3.072711026) */
+
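
Each FIX_* value above is the usual jpeglib Q13 constant, round(x * 2^CONST_BITS); e.g. 0.541196100 * 8192 = 4433.48, which rounds to the 4433 above. Written as the familiar macro (a hypothetical helper, not part of the patch):

#define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5))
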
+
+#define loop_cnt r11
+
+ .text
+
+idct_add_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for some variables on the stack
+ sub sp, 8
+ stdsp SP[0], r12 // rfp
+ stdsp SP[4], r11 // iinc
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+FOR_ROW:
+
+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r10[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne AC_ROW //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d r10++, r4
+ st.d r10++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
+
+ bral COLOUMN_TRANSFORM //Perform the column transform after the row transform is done
+
+
+AC_ROW:
+
+
+ ld.w r12, pc[coef_table - .]
+ ld.w r9, pc[coef_table - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6], r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r12:t
+ mulhh.w r6, r0:t, r12:b
+ ld.w r12, pc[coef_table - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table - . + 12]
+ mulhh.w r4, r4:b, r12:t // r4 = z3
+ mulhh.w lr, lr:b, r12:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r12, pc[coef_table - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table - . + 20]
+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w r10[0], r4:t, r5:t
+ sthh.w r10[4], r3:t, r2:t
+ sthh.w r10[8], r2:b, r3:b
+ sthh.w r10[12], r5:b, r4:b
+
+
+
+ sub r10, -16
+ sub loop_cnt, 1
+ brne FOR_ROW, e
+
+COLOUMN_TRANSFORM:
+
+ sub r10, 128 //Set pointer to start of DCT block
+
+
+ mov loop_cnt, 8
+FOR_COLOUMN:
+ ldins.h r3:t, r10[0] // r3:t = dataptr[0]
+ ldins.h r1:t, r10[1*8*2] // r1:t = dataptr[1]
+ ldins.h r2:t, r10[2*8*2] // r2:t = dataptr[2]
+ ldins.h r0:t, r10[5*8*2] // r0:t = dataptr[5]
+ ldins.h r3:b, r10[4*8*2] // r3:b = dataptr[4]
+ ldins.h r1:b, r10[3*8*2] // r1:b = dataptr[3]
+ ldins.h r2:b, r10[6*8*2] // r2:b = dataptr[6]
+ ldins.h r0:b, r10[7*8*2] // r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne AC_COLOUMN //If there are non-zero AC coefficients, perform the column transform
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9
+ ld.d r0, r12[0]
+ sub r10, -2 // Increment the dataptr
+ bfins r3, r3, 16, 16
+ punpckub.h r2, r1:t
+ padd.h r2, r2, r3
+ punpckub.h r1, r1:b
+ padd.h r1, r1, r3
+ packsh.ub r1, r2, r1
+ punpckub.h r2, r0:t
+ padd.h r2, r2, r3
+ punpckub.h r0, r0:b
+ padd.h r0, r0, r3
+ packsh.ub r0, r2, r0
+ st.d r12[0], r0
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+AC_COLOUMN:
+
+ ld.w r12, pc[coef_table - .]
+ ld.w r9, pc[coef_table - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r12:t // r4 = z1
+ mulhh.w r5, r2:b, r12:b
+ ld.w r12, pc[coef_table - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table - . + 12]
+ mulhh.w r8, r6:b, r12:t // r8 = z3
+ mulhh.w r6, r6:t, r12:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table - . + 16]
+ add r12, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+
+ add r12, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r1, r1, r6
+ packw.sh r8, r8, r0
+ packw.sh r3, r3, r5
+ packw.sh r4, r4, r2
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ ld.d r6, r12[0]
+ sub r10, -2 // Increment the dataptr
+ punpckub.h r0, r7:t
+ padd.h r1, r1, r0
+ punpckub.h r0, r7:b
+ padd.h r8, r8, r0
+ packsh.ub r7, r1, r8
+ punpckub.h r0, r6:t
+ padd.h r3, r3, r0
+ punpckub.h r0, r6:b
+ padd.h r4, r4, r0
+ packsh.ub r6, r3, r4
+ st.d r12[0], r6
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+
+//Coefficient table:
+ .align 2
+coef_table:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, -FIX_0_899976223, -FIX_2_562915447
+
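
The table is read as six 32-bit words (PC-relative offsets +0 ... +20), each word feeding a pair of mulhh.w multiplies through its :t and :b halves. The same layout in C, with the roles taken from the register comments above (a sketch for readability):

static const short coef_table_c[6][2] = {
    {  FIX_0_541196100, -FIX_1_847759065 }, /* z1 scale,   z3 -> tmp2 */
    {  FIX_0_765366865,  FIX_1_175875602 }, /* z2 -> tmp3, z5 scale   */
    { -FIX_1_961570560, -FIX_0_390180644 }, /* z3, z4                 */
    {  FIX_0_298631336,  FIX_2_053119869 }, /* tmp0 (d7), tmp1 (d5)   */
    {  FIX_3_072711026,  FIX_1_501321110 }, /* tmp2 (d3), tmp3 (d1)   */
    { -FIX_0_899976223, -FIX_2_562915447 }  /* z1, z2                 */
};
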
+
+idct_put_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for some variables on the stack
+ sub sp, 8
+ stdsp SP[0], r12 // rfp
+ stdsp SP[4], r11 // iinc
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+0:
+
+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r10[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d r10++, r4
+ st.d r10++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ bral 2f //Perform the column transform after the row transform is done
+
+1:
+
+ ld.w r12, pc[coef_table_copy - .]
+ ld.w r9, pc[coef_table_copy - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6], r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r12:t
+ mulhh.w r6, r0:t, r12:b
+ ld.w r12, pc[coef_table_copy - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table_copy - . + 12]
+ mulhh.w r4, r4:b, r12:t // r4 = z3
+ mulhh.w lr, lr:b, r12:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r12, pc[coef_table_copy - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table_copy - . + 20]
+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w r10[0], r4:t, r5:t
+ sthh.w r10[4], r3:t, r2:t
+ sthh.w r10[8], r2:b, r3:b
+ sthh.w r10[12], r5:b, r4:b
+
+
+
+ sub r10, -16
+ sub loop_cnt, 1
+ brne 0b
+
+2:
+
+ sub r10, 128 //Set pointer to start of DCT block
+
+ mov loop_cnt, 8
+
+0:
+ ldins.h r3:t, r10[0] // r3:t = dataptr[0]
+ ldins.h r1:t, r10[1*8*2] // r1:t = dataptr[1]
+ ldins.h r2:t, r10[2*8*2] // r2:t = dataptr[2]
+ ldins.h r0:t, r10[5*8*2] // r0:t = dataptr[5]
+ ldins.h r3:b, r10[4*8*2] // r3:b = dataptr[4]
+ ldins.h r1:b, r10[3*8*2] // r1:b = dataptr[3]
+ ldins.h r2:b, r10[6*8*2] // r2:b = dataptr[6]
+ ldins.h r0:b, r10[7*8*2] // r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the column transform
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
+ packw.sh r3, r3, r3
+ packsh.ub r3, r3, r3
+ mov r2, r3
+ st.d r12[0], r2
+ add r12, r9 // increment rfp
+ sub r10, -2 // Increment the dataptr
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+1:
+
+ ld.w r12, pc[coef_table_copy - .]
+ ld.w r9, pc[coef_table_copy - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r12:t // r4 = z1
+ mulhh.w r5, r2:b, r12:b
+ ld.w r12, pc[coef_table_copy - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table_copy - . + 12]
+ mulhh.w r8, r6:b, r12:t // r8 = z3
+ mulhh.w r6, r6:t, r12:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table_copy - . + 16]
+ add r12, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table_copy - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+
+ add r12, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r1, r1, r6
+ packw.sh r8, r8, r0
+ packw.sh r3, r3, r5
+ packw.sh r4, r4, r2
+
+ packsh.ub r1, r1, r8
+ packsh.ub r0, r3, r4
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ st.d r12[0], r0
+ sub r10, -2 // Increment the dataptr
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+
+ .align 2
+coef_table_copy:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, -FIX_0_899976223, -FIX_2_562915447
+
+
+idct_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for a temporary block on the stack
+ sub sp, 8*8*2
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+0:
+
+ ldm r12++, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row in the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r12[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d sp++, r4
+ st.d sp++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ bral 2f //Perform the column transform after the row transform is done
+
+1:
+
+ ld.w r10, pc[coef_table_idct - .]
+ ld.w r9, pc[coef_table_idct - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6], r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r10:t
+ mulhh.w r6, r0:t, r10:b
+ ld.w r10, pc[coef_table_idct - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table_idct - . + 12]
+ mulhh.w r4, r4:b, r10:t // r4 = z3
+ mulhh.w lr, lr:b, r10:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r10, pc[coef_table_idct - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table_idct - . + 20]
+ mulhh.w r2, r2:b, r10:t // r2 = tmp2
+ mulhh.w r3, r3:b, r10:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w sp[0], r4:t, r5:t
+ sthh.w sp[4], r3:t, r2:t
+ sthh.w sp[8], r2:b, r3:b
+ sthh.w sp[12], r5:b, r4:b
+
+
+
+ sub sp, -16
+ sub loop_cnt, 1
+ brne 0b
+
+2:
+
+ sub sp, 8*8*2 //Set pointer to start of DCT block
+ sub r12, 8*8*2 //Set pointer to start of DCT block
+
+ mov loop_cnt, 8
+
+0:
+ ldins.h r3:t, sp[0] // r3:t = dataptr[0]
+ ldins.h r1:t, sp[1*8*2] // r1:t = dataptr[1]
+ ldins.h r2:t, sp[2*8*2] // r2:t = dataptr[2]
+ ldins.h r0:t, sp[5*8*2] // r0:t = dataptr[5]
+ ldins.h r3:b, sp[4*8*2] // r3:b = dataptr[4]
+ ldins.h r1:b, sp[3*8*2] // r1:b = dataptr[3]
+ ldins.h r2:b, sp[6*8*2] // r2:b = dataptr[6]
+ ldins.h r0:b, sp[7*8*2] // r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the column transform
+
+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
+ packw.sh r3, r3, r3
+ mov r2, r3
+ st.d r12++, r2
+ st.d r12++, r2
+ sub sp, -2 // Increment the dataptr
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -(8*8*2 - 8)
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+1:
+
+ ld.w r10, pc[coef_table_idct - .]
+ ld.w r9, pc[coef_table_idct - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r10:t // r4 = z1
+ mulhh.w r5, r2:b, r10:b
+ ld.w r10, pc[coef_table_idct - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table_idct - . + 12]
+ mulhh.w r8, r6:b, r10:t // r8 = z3
+ mulhh.w r6, r6:t, r10:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r10, r0:b, r9:t // r10 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table_idct - . + 16]
+ add r10, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table_idct - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+
+ add r10, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r7, r1, r6
+ packw.sh r6, r8, r0
+ packw.sh r5, r3, r5
+ packw.sh r4, r4, r2
+
+ stm r12, r4-r7
+ sub sp, -2 // Increment the dataptr
+ sub r12, -16
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -(8*8*2 - 8)
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+
+ .align 2
+coef_table_idct:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, -FIX_0_899976223, -FIX_2_562915447
+
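
All three entry points in this file follow the classic jpeglib jpeg_idct_islow() factorization that the tmp*/z* register comments refer to. For reference, one pass over eight coefficients looks like this in C (a sketch under that assumption; the assembly additionally saturates via satrnds, which plain shifts do not):

/* one IDCT pass over d[0..7]; shift is CONST_BITS-PASS1_BITS for the
   row pass and CONST_BITS+PASS1_BITS+3 for the column pass */
#define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))

static void idct_pass(const int d[8], int out[8], int shift)
{
    int z1, z2, z3, z4, z5;
    int tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;

    /* even part */
    z1    = (d[2] + d[6]) * FIX_0_541196100;
    tmp2  = z1 - d[6] * FIX_1_847759065;
    tmp3  = z1 + d[2] * FIX_0_765366865;
    tmp0  = (d[0] + d[4]) << CONST_BITS;
    tmp1  = (d[0] - d[4]) << CONST_BITS;
    tmp10 = tmp0 + tmp3;  tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;  tmp12 = tmp1 - tmp2;

    /* odd part */
    z1 = d[7] + d[1];  z2 = d[5] + d[3];
    z3 = d[7] + d[3];  z4 = d[5] + d[1];
    z5 = (z3 + z4) * FIX_1_175875602;
    z3 = z5 - z3 * FIX_1_961570560;
    z4 = z5 - z4 * FIX_0_390180644;
    tmp0 = d[7] * FIX_0_298631336 - z1 * FIX_0_899976223 + z3;
    tmp1 = d[5] * FIX_2_053119869 - z2 * FIX_2_562915447 + z4;
    tmp2 = d[3] * FIX_3_072711026 - z2 * FIX_2_562915447 + z3;
    tmp3 = d[1] * FIX_1_501321110 - z1 * FIX_0_899976223 + z4;

    out[0] = DESCALE(tmp10 + tmp3, shift);
    out[7] = DESCALE(tmp10 - tmp3, shift);
    out[1] = DESCALE(tmp11 + tmp2, shift);
    out[6] = DESCALE(tmp11 - tmp2, shift);
    out[2] = DESCALE(tmp12 + tmp1, shift);
    out[5] = DESCALE(tmp12 - tmp1, shift);
    out[3] = DESCALE(tmp13 + tmp0, shift);
    out[4] = DESCALE(tmp13 - tmp0, shift);
}
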
--- /dev/null
+++ b/libavcodec/avr32/mc.S
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+
+ /* Macro for masking the lowest bit of each byte in a
+ packed word */
+ .macro packedmask1 reg, round
+ .if \round
+ and \reg, \reg, r8 >> 1
+ .else
+ and \reg, r8
+ .endif
+ .endm
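
paddh.ub computes a truncated per-byte (a+b)>>1, so the bit it discards in each byte is exactly the low bit of a^b; packedmask1 extracts those bits so the interpolation code below can add them back as an exact rounding term. In the \round case the mask register already holds 0x02020202, hence the extra shift by one. The identity in C (an illustrative sketch):

#include <stdint.h>

/* per byte: a + b == 2*((a + b) >> 1) + ((a ^ b) & 1), so the masked
   XOR is exactly what a truncating halving add (paddh.ub) throws away */
static inline uint32_t halving_add_lost_bits(uint32_t a, uint32_t b)
{
    return (a ^ b) & 0x01010101u;
}
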
+
+ /* Macro for 8 pixel wide horizontal and vertical interpolation functions */
+ .macro pixels8_hv round, put
+
+
+ pushm r0-r7, lr
+
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+
+ /* Rounding immediate */
+ .if \round
+ mov r8, lo(0x02020202)
+ orh r8, hi(0x02020202)
+ .else
+ mov r8, lo(0x01010101)
+ orh r8, hi(0x01010101)
+ .endif
+ mov r7, 2
+
+ /* Pixel naming convention:
+
+ |-----------------------------------------------------|
+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
+ |----d00---d01---d02---d03---d04---d05---d06---d07----|
+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
+ |-----------------------------------------------------|
+ */
+1:
+ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
+ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
+ mov lr, r9
+ eor r2, r0, r1
+ packedmask1 r2, \round
+ add r2, r8
+
+ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+
+ add r11, r10 // pixels += line_size
+ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
+0:
+ eor r5, r1, r3
+ packedmask1 r5, \round
+ add r2, r5
+
+ paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
+ eor r6, r0, r1
+ packedmask1 r6, \round
+ add r2, r2, r6 << 1
+
+ ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 }
+ add r11, r10 // pixels += line_size
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+
+ paddh.ub r0, r0, r1
+ plsr.b r2, r2, 2
+ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
+
+ /* Next row */
+ .if \put
+ eor r2, r3, r4
+ packedmask1 r2, \round
+ add r2, r8
+ .else
+ ld.w r6, r12[0]
+ eor r2, r3, r4
+ packedmask1 r2, \round
+ add r2, r8
+ pavg.ub r0, r0, r6
+ .endif
+ st.w r12[0], r0 // Put data into the block
+
+ add r5, r2
+ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+
+ eor r6, r0, r1
+ packedmask1 r6, \round
+ add r5, r5, r6 << 1
+
+ .if \put
+ paddh.ub r1, r0, r1
+ plsr.b r5, r5, 2
+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
+ .else
+ ld.w r3, r12[r10]
+ paddh.ub r1, r0, r1
+ plsr.b r5, r5, 2
+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
+ pavg.ub r1, r1, r3
+ .endif
+
+ st.w r12[r10], r1 // Put data into the block
+
+
+ ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 }
+ add r11, r10 // pixels += line_size
+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
+ add r12, r12, r10 << 1 // block += 2*line_size
+ sub lr, 2
+ brne 0b
+
+ mul r0, r10, r9 // r0 = line_size * h
+ rsub r0, r0, 4 // r0 = 4 - (line_size * h)
+ add r11, r0
+ sub r11, r10 // pixels += 4 - (line_size * (h+1))
+ add r12, r0 // block += 4 - (line_size * h)
+ sub r7, 1
+ brne 1b
+
+ popm r0-r7, pc
+ .endm
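
For reference, what the macro computes in scalar terms: the four-tap half-pel average with bias 2 (\round) or 1 (no_rnd), optionally averaged into the existing destination for the avg_* entry points (pavg.ub rounds). A sketch with hypothetical names, not the patch's 4-pixels-at-a-time code path:

#include <stdint.h>

static void pixels8_hv_ref(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h, int rnd, int put)
{
    const int bias = rnd ? 2 : 1;
    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 8; x++) {
            int p = (pixels[x] + pixels[x + 1] +
                     pixels[x + line_size] + pixels[x + line_size + 1] +
                     bias) >> 2;
            block[x] = put ? p : (uint8_t)((block[x] + p + 1) >> 1);
        }
        pixels += line_size;
        block  += line_size;
    }
}
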
+
+
+ /* Macro for 8 pixel wide vertical interpolation functions */
+
+ .macro pixels8_v round, put
+ pushm r4-r7, lr
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+
+ /*
+ Pixel naming convention:
+ |-----------------------------------------------|
+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
+ |-d00---d01---d02---d03---d04---d05---d06---d07-|
+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
+ |-----------------------------------------------|
+ */
+ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
+ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
+ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
+ sub r10, 4 // stride -= 4
+ add r11, r11, r10 << 1 // src += 2*stride
+ sub r11, -4 // src += 4
+
+0:
+ .if \round
+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .else
+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .endif
+
+ .if \put
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
+ .else
+ ld.w lr, r12[0]
+ ld.w r7, r12[4]
+ pavg.ub r5, r5, lr
+ pavg.ub r4, r4, r7
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
+ .endif
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+
+ .if \round
+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .else
+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .endif
+ .if \put
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
+ .else
+ ld.w r8, r12[0]
+ ld.w r6, r12[4]
+ pavg.ub r5, r5, r8
+ pavg.ub r4, r4, r6
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
+ .endif
+
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+ sub r9, 2
+ brne 0b
+
+ popm r4-r7, pc
+ .endm
+
+ /* Macro for 8 pixel wide horizontal interpolation functions */
+
+ .macro pixels8_h round, put
+ pushm r4-r7, lr
+
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+ /*
+ Pixel naming convention:
+ |--------------------------------------------------------------------|
+ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
+ |------|-------|-------|-------|-------|-------|-------|-------|-----|
+ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
+ |--------------------------------------------------------------------|
+ */
+
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+
+0:
+ .if \round
+ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .else
+ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .endif
+ .if \put
+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+ .else
+ ld.w r8, r12[0]
+ ld.w r6, r12[4]
+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+ pavg.ub lr, lr, r8
+ pavg.ub r7, r7, r6
+ .endif
+ st.w r12[0], lr // dst = { d00, d01, d02, d03 }
+ st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
+ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+
+ .if \round
+ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .else
+ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .endif
+ .if \put
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ .else
+ ld.w r7, r12[0]
+ ld.w r6, r12[4]
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ pavg.ub r5, r5, r7
+ pavg.ub r4, r4, r6
+ .endif
+ st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
+ st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+ sub r9, 2
+ brne 0b
+
+ popm r4-r7, pc
+ .endm
+
+ /* Macro for 8 pixel wide copy functions */
+ .macro pixels8 put
+ stm --sp, r3-r7, lr
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+ mov lr, r9
+ sub r3, r10, 2 // stride2 = stride - 2
+0:
+ .if \put
+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
+ .else
+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
+ ld.d r4, r12[0]
+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
+ pavg.ub r6, r6, r4
+ pavg.ub r7, r7, r5
+ ld.d r4, r12[r10]
+ .endif
+ st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }
+ add r11, r11, r3 << 1 // src += stride2 * 2
+ .ifeq \put
+ pavg.ub r8, r8, r4
+ pavg.ub r9, r9, r5
+ .endif
+ st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 }
+ add r12, r12, r10 << 1 // dst += 2*stride
+ sub lr, 2
+ brne 0b
+ ldm sp++, r3-r7, pc
+
+ .endm
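
The scalar equivalents of these three simpler macros, for reference (an illustrative sketch with hypothetical names): pixels8_v and pixels8_h are two-tap half-pel filters where \round selects pavg.ub's rounding (a+b+1)>>1 over paddh.ub's truncating (a+b)>>1, and pixels8 is a straight 8-byte copy, with the avg_* entry points additionally pavg-ing against the destination.

#include <stdint.h>

/* step = 1 models the _h variants, step = line_size the _v variants */
static void pixels8_2tap_ref(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h, int step, int rnd)
{
    for (int i = 0; i < h; i++) {
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x] + pixels[x + step] + rnd) >> 1;
        pixels += line_size;
        block  += line_size;
    }
}
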
+
+ .global put_no_rnd_pixels8_hv_avr32
+ .text
+put_no_rnd_pixels8_hv_avr32:
+ pixels8_hv 0, 1
+
+ .global put_pixels8_hv_avr32
+ .text
+put_pixels8_hv_avr32:
+ pixels8_hv 1, 1
+
+ .global avg_no_rnd_pixels8_hv_avr32
+ .text
+avg_no_rnd_pixels8_hv_avr32:
+ pixels8_hv 0, 0
+
+ .global avg_pixels8_hv_avr32
+ .text
+avg_pixels8_hv_avr32:
+ pixels8_hv 1, 0
+
+ .global put_no_rnd_pixels8_v_avr32
+ .text
+put_no_rnd_pixels8_v_avr32:
+ pixels8_v 0, 1
+
+ .global put_pixels8_v_avr32
+ .text
+put_pixels8_v_avr32:
+ pixels8_v 1, 1
+
+ .global avg_no_rnd_pixels8_v_avr32
+ .text
+avg_no_rnd_pixels8_v_avr32:
+ pixels8_v 0, 0
+
+ .global avg_pixels8_v_avr32
+ .text
+avg_pixels8_v_avr32:
+ pixels8_v 1, 0
+
+ .global put_no_rnd_pixels8_h_avr32
+ .text
+put_no_rnd_pixels8_h_avr32:
+ pixels8_h 0, 1
+
+ .global put_pixels8_h_avr32
+ .text
+put_pixels8_h_avr32:
+ pixels8_h 1, 1
+
+ .global avg_no_rnd_pixels8_h_avr32
+ .text
+avg_no_rnd_pixels8_h_avr32:
+ pixels8_h 0, 0
+
+ .global avg_pixels8_h_avr32
+ .text
+avg_pixels8_h_avr32:
+ pixels8_h 1, 0
+
+ .global put_pixels8_avr32
+ .global put_no_rnd_pixels8_avr32
+ .text
+put_pixels8_avr32:
+put_no_rnd_pixels8_avr32:
+ pixels8 1
+
+ .global avg_no_rnd_pixels8_avr32
+ .global avg_pixels8_avr32
+ .text
+avg_pixels8_avr32:
+avg_no_rnd_pixels8_avr32:
+ pixels8 0
--- /dev/null
+++ b/libavcodec/avr32/pico.h
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#ifndef __PICO_H__
+#define __PICO_H__
+
+
+
+/* Coprocessor Number */
+#define PICO_CPNO 1
+
+/* Pixel Coprocessor Register file */
+#define PICO_REGVECT_INPIX2 cr0
+#define PICO_REGVECT_INPIX1 cr1
+#define PICO_REGVECT_INPIX0 cr2
+#define PICO_REGVECT_OUTPIX2 cr3
+#define PICO_REGVECT_OUTPIX1 cr4
+#define PICO_REGVECT_OUTPIX0 cr5
+#define PICO_REGVECT_COEFF0_A cr6
+#define PICO_REGVECT_COEFF0_B cr7
+#define PICO_REGVECT_COEFF1_A cr8
+#define PICO_REGVECT_COEFF1_B cr9
+#define PICO_REGVECT_COEFF2_A cr10
+#define PICO_REGVECT_COEFF2_B cr11
+#define PICO_REGVECT_VMU0_OUT cr12
+#define PICO_REGVECT_VMU1_OUT cr13
+#define PICO_REGVECT_VMU2_OUT cr14
+#define PICO_REGVECT_CONFIG cr15
+
+#define PICO_INPIX2 0
+#define PICO_INPIX1 1
+#define PICO_INPIX0 2
+#define PICO_OUTPIX2 3
+#define PICO_OUTPIX1 4
+#define PICO_OUTPIX0 5
+#define PICO_COEFF0_A 6
+#define PICO_COEFF0_B 7
+#define PICO_COEFF1_A 8
+#define PICO_COEFF1_B 9
+#define PICO_COEFF2_A 10
+#define PICO_COEFF2_B 11
+#define PICO_VMU0_OUT 12
+#define PICO_VMU1_OUT 13
+#define PICO_VMU2_OUT 14
+#define PICO_CONFIG 15
+
+/* Config Register */
+#define PICO_COEFF_FRAC_BITS_OFFSET 0
+#define PICO_COEFF_FRAC_BITS_SIZE 4
+#define PICO_OFFSET_FRAC_BITS_OFFSET 4
+#define PICO_OFFSET_FRAC_BITS_SIZE 4
+#define PICO_INPUT_MODE_OFFSET 8
+#define PICO_INPUT_MODE_SIZE 2
+#define PICO_OUTPUT_MODE_OFFSET 10
+#define PICO_OUTPUT_MODE_SIZE 1
+
+struct pico_config_t {
+ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE;
+ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
+ unsigned int input_mode : PICO_INPUT_MODE_SIZE;
+ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
+ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
+ int vmu2_out;
+ int vmu1_out;
+ int vmu0_out;
+ short coeff2_2;
+ short coeff2_3;
+ short coeff2_0;
+ short coeff2_1;
+ short coeff1_2;
+ short coeff1_3;
+ short coeff1_0;
+ short coeff1_1;
+ short coeff0_2;
+ short coeff0_3;
+ short coeff0_0;
+ short coeff0_1;
+};
+
+
+#define PICO_COEFF_FRAC_BITS(x) (x << PICO_COEFF_FRAC_BITS_OFFSET)
+#define PICO_OFFSET_FRAC_BITS(x) (x << PICO_OFFSET_FRAC_BITS_OFFSET)
+#define PICO_INPUT_MODE(x) (x << PICO_INPUT_MODE_OFFSET)
+#define PICO_OUTPUT_MODE(x) (x << PICO_OUTPUT_MODE_OFFSET)
+
+#define GET_PICO_COEFF_FRAC_BITS(x) ((x >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
+#define GET_PICO_OFFSET_FRAC_BITS(x) ((x >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
+#define GET_PICO_INPUT_MODE(x) ((x >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
+#define GET_PICO_OUTPUT_MODE(x) ((x >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
+
+enum pico_input_mode { PICO_TRANSFORMATION_MODE,
+ PICO_HOR_FILTER_MODE,
+ PICO_VERT_FILTER_MODE };
+
+enum pico_output_mode { PICO_PACKED_MODE,
+ PICO_PLANAR_MODE };
+
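
Typical use of the accessors above is to assemble a CONFIG word from the shift macros and move it to cr15 via PICO_PUT_W; the fraction-bit values in this sketch are invented for illustration:

unsigned int cfg = PICO_COEFF_FRAC_BITS(12)     /* illustrative values */
                 | PICO_OFFSET_FRAC_BITS(8)
                 | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
                 | PICO_OUTPUT_MODE(PICO_PACKED_MODE);
PICO_PUT_W(PICO_CONFIG, cfg);
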
  5215. +/* Bits in coefficients */
  5216. +#define PICO_COEFF_BITS 12
  5217. +
  5218. +/* Operation bits */
  5219. +#define PICO_MATRIX (0)
  5220. +#define PICO_USE_ACC (1 << 2)
  5221. +#define PICO_SINGLE_VECTOR (1 << 3)
  5222. +
  5223. +
  5224. +#define __str(x...) #x
  5225. +#define __xstr(x...) __str(x)
  5226. +
  5227. +#define PICO_PUT_W(pico_reg, x) \
  5228. + __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
  5229. +#define PICO_GET_W(pico_reg) \
  5230. + __builtin_mvcr_w(PICO_CPNO, pico_reg)
  5231. +
  5232. +#define PICO_MVCR_W(x, pico_reg) \
  5233. + asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
  5234. +
  5235. +#define PICO_MVRC_W(pico_reg, x) \
  5236. + asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
  5237. +
  5238. +#define PICO_PUT_D(pico_reg, x) \
  5239. + __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
  5240. +#define PICO_GET_D(pico_reg) \
  5241. + __builtin_mvcr_d(PICO_CPNO, pico_reg)
  5242. +
  5243. +#define PICO_MVCR_D(x, pico_reg) \
  5244. + asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
  5245. +#define PICO_MVRC_D(pico_reg, x) \
  5246. + asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
  5247. +
  5248. +#define PICO_STCM_W(ptr, pico_regs...) \
  5249. + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5250. +#define PICO_STCM_D(ptr, pico_regs...) \
  5251. + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5252. +
  5253. +#define PICO_STCM_W_DEC(ptr, pico_regs...) \
  5254. + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
  5255. +#define PICO_STCM_D_DEC(ptr, pico_regs...) \
  5256. + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
  5257. +
  5258. +#define PICO_LDCM_W(ptr, pico_regs...) \
  5259. + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5260. +#define PICO_LDCM_D(ptr, pico_regs...) \
  5261. + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
  5262. +
  5263. +#define PICO_LDCM_W_INC(ptr, pico_regs...) \
  5264. + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
  5265. +#define PICO_LDCM_D_INC(ptr, pico_regs...) \
  5266. + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
  5267. +
  5268. +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
  5269. + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
  5270. +
  5271. +static inline void set_pico_config(struct pico_config_t *config){
  5272. + PICO_LDCM_D(config,
  5273. + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
  5274. + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
  5275. + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
  5276. + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
  5277. + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
  5278. +}
  5279. +
  5280. +static inline void get_pico_config(struct pico_config_t *config){
  5281. + PICO_STCM_D(config,
  5282. + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
  5283. + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
  5284. + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
  5285. + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
  5286. + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
  5287. +}
  5288. +
  5289. +static inline void dump_pico_config(){
  5290. + struct pico_config_t pico_config;
  5291. + char *input_mode, *output_mode;
  5292. + get_pico_config(&pico_config);
  5293. +
  5294. +
  5295. + av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
  5296. + av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
  5297. + av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
  5298. +
  5299. + switch ( pico_config.input_mode ){
  5300. + case PICO_TRANSFORMATION_MODE:
  5301. + input_mode = "Transformation Mode";
  5302. + break;
  5303. + case PICO_HOR_FILTER_MODE:
  5304. + input_mode = "Horisontal Filter Mode";
+ break;
+ case PICO_VERT_FILTER_MODE:
+ input_mode = "Vertical Filter Mode";
+ break;
+ default:
+ input_mode = "Unknown Mode!!";
+ break;
+ }
+ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
+
+ switch ( pico_config.output_mode ){
+ case PICO_PLANAR_MODE:
+ output_mode = "Planar Mode";
+ break;
+ case PICO_PACKED_MODE:
+ output_mode = "Packed Mode";
+ break;
+ default:
+ output_mode = "Unknown Mode!!";
+ break;
+ }
+
+ av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
+}
+
+
+
+#endif
+
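The set_pico_config()/get_pico_config() helpers above transfer the whole coefficient bank plus the CONFIG register in a single ldcm.d/stcm.d burst rather than sixteen separate mvrc/mvcr moves. For reference, a plain-C sketch of how the CONFIG word is packed; the shift positions repeat the PICO_* defines from this header, and the example values (9 coefficient fraction bits, 2 offset fraction bits, transformation input, packed output) are the ones the libswscale code further down programs:

    /* Illustrative sketch only, no coprocessor needed. */
    #include <stdint.h>
    #include <stdio.h>

    #define PICO_COEFF_FRAC_BITS 0
    #define PICO_OFFSET_FRAC_BITS 4
    #define PICO_INPUT_MODE 8
    #define PICO_OUTPUT_MODE 10

    static uint32_t pico_pack_config(uint32_t coeff_frac, uint32_t offset_frac,
                                     uint32_t input_mode, uint32_t output_mode)
    {
        return (coeff_frac << PICO_COEFF_FRAC_BITS)
             | (offset_frac << PICO_OFFSET_FRAC_BITS)
             | (input_mode << PICO_INPUT_MODE)
             | (output_mode << PICO_OUTPUT_MODE);
    }

    int main(void)
    {
        printf("CONFIG = 0x%08x\n",
               (unsigned)pico_pack_config(9, 2, 0, 0)); /* prints 0x00000029 */
        return 0;
    }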
--- a/libavcodec/bitstream.h
+++ b/libavcodec/bitstream.h
@@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM {
 #endif
 /* used to avoid missaligned exceptions on some archs (alpha, ...) */
-#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32)
 # define unaligned16(a) (*(const uint16_t*)(a))
 # define unaligned32(a) (*(const uint32_t*)(a))
 # define unaligned64(a) (*(const uint64_t*)(a))
@@ -813,6 +813,44 @@ void free_vlc(VLC *vlc);
 * if the vlc code is invalid and max_depth>1 than the number of bits removed
 * is undefined
 */
+
+#if defined(ARCH_AVR32)
+#define GET_VLC(code, name, gb, table, bits, max_depth)\
+{\
+ int n, index, nb_bits;\
+ union { VLC_TYPE vlc[2];\
+ uint32_t u32; } table_elem;\
+\
+ index= SHOW_UBITS(name, gb, bits);\
+ table_elem.u32 = unaligned32(&table[index]); \
+ code = table_elem.vlc[0];\
+ n = table_elem.vlc[1];\
+\
+ if(max_depth > 1 && n < 0 ){\
+ LAST_SKIP_BITS(name, gb, bits)\
+ UPDATE_CACHE(name, gb)\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
+ table_elem.u32 = unaligned32(&table[index]); \
+ code = table_elem.vlc[0];\
+ n = table_elem.vlc[1];\
+ if(max_depth > 2 && n < 0){\
+ LAST_SKIP_BITS(name, gb, nb_bits)\
+ UPDATE_CACHE(name, gb)\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
+ code = table[index][0];\
+ n = table[index][1];\
+ }\
+ }\
+ SKIP_BITS(name, gb, n)\
+}
+
+#else
 #define GET_VLC(code, name, gb, table, bits, max_depth)\
 {\
 int n, index, nb_bits;\
@@ -821,7 +859,7 @@ void free_vlc(VLC *vlc);
 code = table[index][0];\
 n = table[index][1];\
 \
- if(max_depth > 1 && n < 0){\
+ if(max_depth > 1 && n < 0 ){\
 LAST_SKIP_BITS(name, gb, bits)\
 UPDATE_CACHE(name, gb)\
 \
@@ -843,7 +881,38 @@ void free_vlc(VLC *vlc);
 }\
 SKIP_BITS(name, gb, n)\
 }
+#endif
+#if defined(ARCH_AVR32)
+#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
+{\
+ int n, index, nb_bits;\
+ union { RL_VLC_ELEM vlc;\
+ uint32_t u32; } table_elem;\
+\
+ index= SHOW_UBITS(name, gb, bits);\
+ table_elem.u32 = unaligned32(&table[index]); \
+ level = table_elem.vlc.level;\
+ n = table_elem.vlc.len;\
+\
+ if(max_depth > 1 && n < 0 ){\
+ SKIP_BITS(name, gb, bits)\
+ if(need_update){\
+ UPDATE_CACHE(name, gb)\
+ }\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + level;\
+ table_elem.u32 = unaligned32(&table[index]); \
+ level = table_elem.vlc.level;\
+ n = table_elem.vlc.len;\
+ }\
+ run= table_elem.vlc.run;\
+ SKIP_BITS(name, gb, n)\
+}
+
+#else
 #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
 {\
 int n, index, nb_bits;\
@@ -852,7 +921,7 @@ void free_vlc(VLC *vlc);
 level = table[index].level;\
 n = table[index].len;\
 \
- if(max_depth > 1 && n < 0){\
+ if(max_depth > 1 && n < 0 ){\
 SKIP_BITS(name, gb, bits)\
 if(need_update){\
 UPDATE_CACHE(name, gb)\
@@ -867,7 +936,7 @@ void free_vlc(VLC *vlc);
 run= table[index].run;\
 SKIP_BITS(name, gb, n)\
 }
-
+#endif
 /**
 * parses a vlc code, faster then get_vlc()
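Note on the two AVR32 macro variants added above: they differ from the generic GET_VLC/GET_RL_VLC only in how a table entry is fetched. VLC_TYPE is int16_t, so each entry is a (code, length) pair of halfwords; loading it through a union as one 32-bit word replaces two 16-bit loads, which is also why ARCH_AVR32 joins the unaligned16/32/64 list in the first hunk. A standalone sketch of the access pattern (the helper name is invented for illustration):

    #include <stdint.h>
    #include <string.h>

    typedef int16_t VLC_TYPE;

    /* Fetch one (code, length) table entry with a single 32-bit access. */
    static void vlc_read_pair(const VLC_TYPE entry[2], int *code, int *n)
    {
        union { VLC_TYPE vlc[2]; uint32_t u32; } e;
        memcpy(&e.u32, entry, sizeof e.u32); /* stands in for unaligned32() */
        *code = e.vlc[0];
        *n = e.vlc[1];
    }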
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodec
 #ifdef ARCH_BFIN
 dsputil_init_bfin(c,avctx);
 #endif
+#ifdef ARCH_AVR32
+ dsputil_init_avr32(c,avctx);
+#endif
 for(i=0; i<64; i++){
 if(!c->put_2tap_qpel_pixels_tab[0][i])
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){
 static void init_dequant8_coeff_table(H264Context *h){
 int i,q,x;
+#ifdef ARCH_AVR32
+ const int transpose = 0;
+#else
 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
+#endif
+
 h->dequant8_coeff[0] = h->dequant8_buffer[0];
 h->dequant8_coeff[1] = h->dequant8_buffer[1];
@@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H2
 static void init_dequant4_coeff_table(H264Context *h){
 int i,j,q,x;
+ // Yes this is ugly as hell....
+#ifdef ARCH_AVR32
+ const int transpose = 0;
+#else
 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
+#endif
+
 for(i=0; i<6; i++ ){
 h->dequant4_coeff[i] = h->dequant4_buffer[i];
 for(j=0; j<i; j++){
@@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Conte
 if (MPV_common_init(s) < 0)
 return -1;
+#ifdef ARCH_AVR32
+ if ( 1 ){
+#else
 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
+#endif
 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
 }else{
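All three h264.c hunks above serve one purpose: the AVR32 IDCT routines supplied by this patch consume coefficients in the plain C layout, so the transposed dequant tables and scan tables that FFmpeg otherwise prepares for accelerated IDCTs must stay disabled. A sketch of what the transpose flag changes when the tables are filled (the 8x8 index swap mirrors the pattern used by init_dequant8_coeff_table; illustrative only):

    /* With transpose == 0 entry x keeps its scan position; with
     * transpose == 1 it is stored at the transposed 8x8 position. */
    static int dequant8_index(int x, int transpose)
    {
        return transpose ? ((x >> 3) | ((x & 7) << 3)) : x;
    }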
--- a/libavutil/common.h
+++ b/libavutil/common.h
@@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b,
 * @param amax maximum value of the clip range
 * @return cliped value
 */
+#if defined(ARCH_AVR32)
+#define clip(a, amin, amax) \
+ ({ int __tmp__; \
+ asm ("min\t%0, %1, %2\n" \
+ "max\t%0, %0, %3\n" \
+ : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \
+ __tmp__; })
+#else
 static inline int clip(int a, int amin, int amax)
 {
 if (a < amin) return amin;
 else if (a > amax) return amax;
 else return a;
 }
+#endif
 /**
 * clip a signed integer value into the 0-255 range
 * @param a value to clip
 * @return cliped value
 */
+#if defined(ARCH_AVR32)
+#define clip_uint8(a) \
+ ({ int __tmp__ = a; \
+ asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \
+ __tmp__; })
+#else
 static inline uint8_t clip_uint8(int a)
 {
 if (a&(~255)) return (-a)>>31;
 else return a;
 }
+#endif
 /* math */
 int64_t ff_gcd(int64_t a, int64_t b);
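The AVR32 clip() above maps the clamp onto a min/max instruction pair, and clip_uint8() onto a single satu (saturate unsigned) instruction. A reference model of what "satu %0 >> 0, 8" computes; it matches the C fallback bit for bit:

    #include <stdint.h>

    /* satu rd >> 0, 8: clamp a signed value into the unsigned 8-bit range. */
    static uint8_t satu8(int a)
    {
        if (a < 0) return 0;
        if (a > 255) return 255;
        return (uint8_t)a;
    }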
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -210,6 +210,15 @@ if((y)<(x)){
 }\
 }
+/* XXX: Hack for uclibc which declares lrintf but does not implement it... */
+#ifdef ARCH_AVR32
+#undef HAVE_LRINTF
+#define HAVE_LRINTF 1
+#define lrintf(x) rint(x)
+#define llrint(x) (long long)rint(x)
+#endif
+
+
 #ifndef HAVE_LRINTF
 /* XXX: add ISOC specific test to avoid specific BSD testing. */
 /* better than nothing implementation. */
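The hunk above works around uClibc, which declares lrintf() in math.h but ships no implementation, so linking fails. Forcing HAVE_LRINTF to 1 keeps FFmpeg's fallback out of the way, and the #define routes calls to rint(), which uClibc does provide. A sketch of what a call becomes after the macro:

    #include <math.h>

    /* lrintf(x) expands to rint(x); the conversion of the double result
     * to long then happens implicitly, here in the return statement. */
    static long lrintf_model(float x)
    {
        return rint(x);
    }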
--- a/libfaad2/common.h
+++ b/libfaad2/common.h
@@ -67,7 +67,7 @@ extern "C" {
 /* Use if target platform has address generators with autoincrement */
 //#define PREFER_POINTERS
-#if defined(_WIN32_WCE) || defined(__arm__)
+#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
 #define FIXED_POINT
 #endif
--- a/libmpcodecs/ad_libmad.c
+++ b/libmpcodecs/ad_libmad.c
@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
 sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
 sh->samplerate=this->frame.header.samplerate;
 sh->i_bps=this->frame.header.bitrate/8;
+#ifdef WORDS_BIGENDIAN
+ sh->sample_format = AF_FORMAT_S16_BE;
+#else
+ sh->sample_format = AF_FORMAT_S16_LE;
+#endif
 sh->samplesize=2;
 return 1;
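libmad synthesizes 16-bit PCM in host byte order, so the hunk above reports the sample format as big- or little-endian to match the build target; without it a big-endian target such as AVR32 would be handed the wrong default. A runtime equivalent of the compile-time WORDS_BIGENDIAN test, for illustration:

    #include <stdint.h>

    /* Returns 1 on big-endian hosts (e.g. AVR32), 0 on little-endian ones. */
    static int host_is_big_endian(void)
    {
        const uint16_t probe = 1;
        return *(const uint8_t *)&probe == 0;
    }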
--- /dev/null
+++ b/libswscale/pico-avr32.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#ifndef __PICO_H__
+#define __PICO_H__
+
+/* Coprocessor Number */
+#define PICO_CPNO 1
+
+/* Pixel Coprocessor Register file */
+#define PICO_REGVECT_INPIX2 cr0
+#define PICO_REGVECT_INPIX1 cr1
+#define PICO_REGVECT_INPIX0 cr2
+#define PICO_REGVECT_OUTPIX2 cr3
+#define PICO_REGVECT_OUTPIX1 cr4
+#define PICO_REGVECT_OUTPIX0 cr5
+#define PICO_REGVECT_COEFF0_A cr6
+#define PICO_REGVECT_COEFF0_B cr7
+#define PICO_REGVECT_COEFF1_A cr8
+#define PICO_REGVECT_COEFF1_B cr9
+#define PICO_REGVECT_COEFF2_A cr10
+#define PICO_REGVECT_COEFF2_B cr11
+#define PICO_REGVECT_VMU0_OUT cr12
+#define PICO_REGVECT_VMU1_OUT cr13
+#define PICO_REGVECT_VMU2_OUT cr14
+#define PICO_REGVECT_CONFIG cr15
+
+#define PICO_INPIX2 0
+#define PICO_INPIX1 1
+#define PICO_INPIX0 2
+#define PICO_OUTPIX2 3
+#define PICO_OUTPIX1 4
+#define PICO_OUTPIX0 5
+#define PICO_COEFF0_A 6
+#define PICO_COEFF0_B 7
+#define PICO_COEFF1_A 8
+#define PICO_COEFF1_B 9
+#define PICO_COEFF2_A 10
+#define PICO_COEFF2_B 11
+#define PICO_VMU0_OUT 12
+#define PICO_VMU1_OUT 13
+#define PICO_VMU2_OUT 14
+#define PICO_CONFIG 15
+
+/* Config Register */
+#define PICO_COEFF_FRAC_BITS 0
+#define PICO_COEFF_FRAC_BITS_WIDTH 4
+#define PICO_OFFSET_FRAC_BITS 4
+#define PICO_OFFSET_FRAC_BITS_WIDTH 4
+#define PICO_INPUT_MODE 8
+#define PICO_INPUT_MODE_WIDTH 2
+#define PICO_OUTPUT_MODE 10
+
+#define PICO_TRANSFORMATION_MODE 0
+#define PICO_HOR_FILTER_MODE 1
+#define PICO_VERT_FILTER_MODE 2
+
+#define PICO_PLANAR_MODE 1
+#define PICO_PACKED_MODE 0
+
+/* Bits in coefficients */
+#define PICO_COEFF_BITS 12
+
+/* Operation bits */
+#define PICO_USE_ACC (1 << 2)
+#define PICO_SINGLE_VECTOR (1 << 3)
+
+
+#define __str(x...) #x
+#define __xstr(x...) __str(x)
+
+#define PICO_PUT_W(pico_reg, x) \
+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
+#define PICO_GET_W(pico_reg) \
+ __builtin_mvcr_w(PICO_CPNO, pico_reg)
+
+#define PICO_PUT_D(pico_reg, x) \
+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
+#define PICO_GET_D(pico_reg) \
+ __builtin_mvcr_d(PICO_CPNO, pico_reg)
+
+
+#define PICO_STCM_W(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_STCM_D(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_LDCM_W(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_LDCM_D(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
+
+
+#endif
+
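This header duplicates the PICO definitions added earlier in the patch so that libswscale needs no cross-directory include. Judging by how yuv2rgb_c_init_tables_avr32() below scales its coefficients, each of the three vector multiplication units forms a dot product of an input pixel triple with one coefficient row and adds an offset, in mixed fixed point. A plain-C model of that arithmetic, an inference from the coefficient setup rather than a hardware specification:

    /* One VMU, modelled in C: coefficients carry coeff_frac fraction bits
     * (9 in the swscale code), offsets carry offset_frac bits (2), and the
     * result is saturated to an 8-bit pixel. */
    static unsigned char pico_vmu_model(int in0, int in1, int in2,
                                        const short c[4],
                                        int coeff_frac, int offset_frac)
    {
        int acc = c[0]*in0 + c[1]*in1 + c[2]*in2;         /* coeff_frac bits */
        acc = (acc >> (coeff_frac - offset_frac)) + c[3]; /* offset_frac bits */
        acc >>= offset_frac;                              /* integer pixel */
        if (acc < 0) acc = 0;
        if (acc > 255) acc = 255;
        return (unsigned char)acc;
    }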
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -173,7 +173,7 @@ typedef struct SwsContext{
 SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
 int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
-char *sws_format_name(int format);
+char *sws_format_name(enum PixelFormat format);
 //FIXME replace this with something faster
 #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -44,6 +44,10 @@
 #include "yuv2rgb_mlib.c"
 #endif
+#ifdef ARCH_AVR32
+#include "yuv2rgb_avr32.c"
+#endif
+
 #define DITHER1XBPP // only for mmx
 const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
@@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext
 if(t) return t;
 }
 #endif
+#ifdef ARCH_AVR32
+ {
+ SwsFunc t= yuv2rgb_init_avr32(c);
+ if(t) return t;
+ }
+#endif
 #ifdef HAVE_ALTIVEC
 if (c->flags & SWS_CPU_CAPS_ALTIVEC)
 {
@@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c
 //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
 oy -= 256*brightness;
+#ifdef ARCH_AVR32
+ yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
+#endif
+
 for (i = 0; i < 1024; i++) {
 int j;
--- /dev/null
+++ b/libswscale/yuv2rgb_avr32.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#include "pico-avr32.h"
+
+
+#define RGB(uv_part) \
+ __asm__ volatile ( \
+ "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \
+ "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \
+ "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \
+ "add\t%1, %0\n\t" /* g += tmp */\
+ "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \
+ : "=&r" (r), "=&r" (g), "=&r" (b) \
+ : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
+ "r" (&c->table_rV[0]), "r" (V), "r" (U));
+
+
+#undef YUV2RGB1
+#define YUV2RGB1(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+#undef YUV2RGB2
+#define YUV2RGB2(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+
+#undef YUV2BGR1
+#define YUV2BGR1(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+#undef YUV2BGR2
+#define YUV2BGR2(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+
+
+int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], int dstStride[]){
+ int y;
+
+ if(c->srcFormat == PIX_FMT_YUV422P){
+ srcStride[1] *= 2;
+ srcStride[2] *= 2;
+ }
+
+
+ for(y=0; y<srcSliceH; y+=2){
+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+ uint32_t *r, *g, *b;
+ uint8_t *py_1= src[0] + y*srcStride[0];
+ uint8_t *py_2= py_1 + srcStride[0];
+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
+ unsigned int h_size= c->dstW>>3;
+ while (h_size--) {
+ uint32_t U, V, Y1, Y2, tmp;
+ U = ((uint32_t*)pu)[0];
+ V = ((uint32_t*)pv)[0];
+
+ RGB("t")
+ YUV2BGR1(dst_1, py_1, Y1, 0)
+ YUV2BGR1(dst_2, py_2, Y2, 0)
+
+ RGB("u")
+ YUV2BGR2(dst_1, py_1, Y1, 1)
+ YUV2BGR2(dst_2, py_2, Y2, 1)
+
+ RGB("l")
+ YUV2BGR1(dst_1, py_1, Y1, 2)
+ YUV2BGR1(dst_2, py_2, Y2, 2)
+
+ RGB("b")
+ YUV2BGR2(dst_1, py_1, Y1, 3)
+ YUV2BGR2(dst_2, py_2, Y2, 3)
+
+
+
+ pu += 4;
+ pv += 4;
+ py_1 += 8;
+ py_2 += 8;
+ dst_1 += 24;
+ dst_2 += 24;
+ }
+ }
+ return srcSliceH;
+}
+
+
+
+static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], int dstStride[]){
+ int y;
+
+ if(c->srcFormat == PIX_FMT_YUV422P){
+ srcStride[1] *= 2;
+ srcStride[2] *= 2;
+ }
+ for(y=0; y<srcSliceH; y+=2){
+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+ uint8_t *r, *g, *b;
+ uint8_t *py_1= src[0] + y*srcStride[0];
+ uint8_t *py_2= py_1 + srcStride[0];
+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
+ unsigned int h_size= c->dstW>>3;
+ while (h_size--) {
+ uint32_t U, V, Y1, Y2, tmp;
+ U = ((uint32_t*)pu)[0];
+ V = ((uint32_t*)pv)[0];
+
+ RGB("t")
+ YUV2RGB1(dst_1, py_1, Y1, 0)
+ YUV2RGB1(dst_2, py_2, Y2, 0)
+
+ RGB("u")
+ YUV2RGB2(dst_1, py_1, Y1, 1)
+ YUV2RGB2(dst_2, py_2, Y2, 1)
+
+ RGB("l")
+ YUV2RGB1(dst_1, py_1, Y1, 2)
+ YUV2RGB1(dst_2, py_2, Y2, 2)
+
+ RGB("b")
+ YUV2RGB2(dst_1, py_1, Y1, 3)
+ YUV2RGB2(dst_2, py_2, Y2, 3)
+
+ pu += 4;
+ pv += 4;
+ py_1 += 8;
+ py_2 += 8;
+ dst_1 += 24;
+ dst_2 += 24;
+ }
+ }
+ return srcSliceH;
+}
+
+#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits)
+#define COEFF_FRAC_BITS 9
+#define OFFSET_FRAC_BITS 2
+
+/* Coefficients used in the pico */
+static struct {
+ short coeff2_2;
+ short coeff2_3;
+ short coeff2_0;
+ short coeff2_1;
+ short coeff1_2;
+ short coeff1_3;
+ short coeff1_0;
+ short coeff1_1;
+ short coeff0_2;
+ short coeff0_3;
+ short coeff0_0;
+ short coeff0_1;
+} pico_coeff;
+
+
+static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], int dstStride[]){
+ int y;
+ static int first_time = 1;
+
+ /* Initialize pico */
+ PICO_LDCM_D(&pico_coeff,
+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
+
+ PICO_PUT_W(PICO_CONFIG,
+ (PICO_PACKED_MODE << PICO_OUTPUT_MODE
+ | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
+ | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
+ | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
+
+
+ if(c->srcFormat == PIX_FMT_YUV422P){
+ srcStride[1] *= 2;
+ srcStride[2] *= 2;
+ }
+
+ for(y=0; y<srcSliceH; y+=2){
+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+ uint8_t *r, *g, *b;
+ uint8_t *py_1= src[0] + y*srcStride[0];
+ uint8_t *py_2= py_1 + srcStride[0];
+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
+ unsigned int h_size= c->dstW>>3;
+ int *py_1_int = (int *)py_1;
+ int *py_2_int = (int *)py_2;
+ int *pu_int = (int *)pu;
+ int *pv_int = (int *)pv;
+ while (h_size--) {
+ PICO_PUT_W(PICO_INPIX0, *py_1_int++);
+ PICO_PUT_W(PICO_INPIX1, *pu_int++);
+ PICO_PUT_W(PICO_INPIX2, *pv_int++);
+ PICO_OP(0, 0, 0, 4, 8);
+ PICO_OP(0, 1, 1, 4, 8);
+ PICO_OP(0, 2, 2, 5, 9);
+ PICO_OP(0, 3, 3, 5, 9);
+ PICO_PUT_W(PICO_INPIX0, *py_1_int++);
+ PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+ PICO_OP(0, 0, 0, 6, 10);
+ PICO_OP(0, 1, 1, 6, 10);
+ PICO_OP(0, 2, 2, 7, 11);
+ PICO_OP(0, 3, 3, 7, 11);
+ PICO_PUT_W(PICO_INPIX0, *py_2_int++);
+ PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+
+ PICO_OP(0, 0, 0, 4, 8);
+ PICO_OP(0, 1, 1, 4, 8);
+ PICO_OP(0, 2, 2, 5, 9);
+ PICO_OP(0, 3, 3, 5, 9);
+ PICO_PUT_W(PICO_INPIX0, *py_2_int++);
+ PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+ PICO_OP(0, 0, 0, 6, 10);
+ PICO_OP(0, 1, 1, 6, 10);
+ PICO_OP(0, 2, 2, 7, 11);
+ PICO_OP(0, 3, 3, 7, 11);
+ PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+
+ dst_1 += 24;
+ dst_2 += 24;
+ }
+ }
+ return srcSliceH;
+}
+
+extern int avr32_use_pico;
+
+SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
+ switch(c->dstFormat){
+ case PIX_FMT_BGR24:
+ {
+ if ( avr32_use_pico ){
+ MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n");
+ return yuv2bgr24_avr32_pico;
+ } else {
+ MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n");
+ return yuv2bgr24_avr32;
+ }
+ }
+ break;
+ case PIX_FMT_RGB24:
+ {
+ if ( avr32_use_pico ){
+ MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n");
+ return yuv2bgr24_avr32_pico;
+ } else {
+ MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n");
+ return yuv2rgb24_avr32;
+ }
+ }
+ }
+ return NULL;
+}
+
+
+int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
+ const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
+
+ int64_t crv = inv_table[0];
+ int64_t cbu = inv_table[1];
+ int64_t cgu = -inv_table[2];
+ int64_t cgv = -inv_table[3];
+ int64_t cy = 1<<16;
+ int64_t oy = 0;
+
+ if(!fullRange){
+ cy= (cy*255) / 219;
+ oy= 16<<16;
+ }
+
+ cy = (cy *contrast )>>16;
+ crv= (crv*contrast * saturation)>>32;
+ cbu= (cbu*contrast * saturation)>>32;
+ cgu= (cgu*contrast * saturation)>>32;
+ cgv= (cgv*contrast * saturation)>>32;
+
+ oy -= 256*brightness;
+
+ pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */
+ pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
+ pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */
+ pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
+
+ if ( isRgb ){
+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
+ pico_coeff.coeff0_1 = 0; /* R <- U */
+ pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
+ pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
+
+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
+ pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
+ pico_coeff.coeff2_2 = 0; /* B <- V */
+ pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
+ } else {
+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
+ pico_coeff.coeff2_1 = 0; /* R <- U */
+ pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
+ pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
+
+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
+ pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
+ pico_coeff.coeff0_2 = 0; /* B <- V */
+ pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
+ }
+
+ return 0;
+}
+
+
+#undef RGB
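yuv2rgb_c_init_tables_avr32() reduces FFmpeg's 16.16 colour-matrix coefficients to the 9 fraction bits the PICO holds, with SCALE() rounding to nearest. A worked example for the red/V coefficient, assuming the common BT.601 value inv_table[0] = 104597 (about 1.596 in 16.16) and neutral contrast/saturation; the arithmetic itself is exactly the patch's:

    #include <stdio.h>

    #define SCALE(x, bits) (((x) + (1 << ((bits) - 1))) >> (bits))
    #define COEFF_FRAC_BITS 9

    int main(void)
    {
        long long crv = 104597;                    /* assumed inv_table[0] */
        long long contrast = 1 << 16, saturation = 1 << 16;

        crv = (crv * contrast * saturation) >> 32; /* unchanged: still 16.16 */
        printf("coeff0_2 = %d (~ %.4f)\n",
               (int)SCALE(crv, 16 - COEFF_FRAC_BITS),     /* 817 */
               SCALE(crv, 16 - COEFF_FRAC_BITS) / 512.0); /* 1.5957 */
        return 0;
    }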
--- a/libvo/vo_fbdev2.c
+++ b/libvo/vo_fbdev2.c
@@ -22,6 +22,9 @@
 #include "sub.h"
 #include "mp_msg.h"
+/* Draw directly to framebuffer */
+#define USE_CONVERT2FB
+
 static vo_info_t info = {
 "Framebuffer Device",
 "fbdev2",
@@ -178,6 +181,15 @@ static int fb_preinit(int reset)
 }
 fb_orig_vinfo = fb_vinfo;
+ /* Reset panning offset */
+ fb_vinfo.yoffset = 0;
+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
+ mp_msg(MSGT_VO, MSGL_ERR,
+ "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
+ strerror(errno));
+ return 0;
+ }
+
 fb_bpp = fb_vinfo.bits_per_pixel;
 /* 16 and 15 bpp is reported as 16 bpp */
@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32
 mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
 return 1;
 }
+#else
+ if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
+ && fb_vinfo.yoffset == 0)
+ center += fb_line_len * fb_vinfo.yres;
 #endif
 if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
@@ -299,14 +315,22 @@ static int query_format(uint32_t format)
 {
 // open the device, etc.
 if (fb_preinit(0)) return 0;
- if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
+ if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
 int fb_target_bpp = format & 0xff;
 set_bpp(&fb_vinfo, fb_target_bpp);
 fb_vinfo.xres_virtual = fb_vinfo.xres;
- fb_vinfo.yres_virtual = fb_vinfo.yres;
+ fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
 if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
- return 0;
+ mp_msg(MSGT_VO, MSGL_WARN,
+ "[fbdev2] Can't double virtual y resolution: %s\n",
+ strerror(errno));
+ fb_vinfo.yres_virtual = fb_vinfo.yres;
+ if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
+ mp_msg(MSGT_VO, MSGL_ERR,
+ "[fbdev2] Can't put VSCREENINFO: %s\n",
+ strerror(errno));
+ return -1;
+ }
 }
 fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
 fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
@@ -367,16 +391,67 @@ static void check_events(void)
 static void flip_page(void)
 {
-#ifndef USE_CONVERT2FB
 int i, out_offset = 0, in_offset = 0;
- for (i = 0; i < in_height; i++) {
- memcpy(center + out_offset, next_frame + in_offset,
- in_width * fb_pixel_size);
- out_offset += fb_line_len;
- in_offset += in_width * fb_pixel_size;
- }
+#ifndef USE_CONVERT2FB
+ if (1) {
+#else
+ if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
 #endif
+ for (i = 0; i < in_height; i++) {
+ memcpy(center + out_offset, next_frame + in_offset,
+ in_width * fb_pixel_size);
+ out_offset += fb_line_len;
+ in_offset += in_width * fb_pixel_size;
+ }
+ } else {
+ if (fb_vinfo.yoffset == 0) {
+ fb_vinfo.yoffset += fb_vinfo.yres;
+ center -= fb_line_len * fb_vinfo.yres;
+ } else {
+ fb_vinfo.yoffset = 0;
+ center += fb_line_len * fb_vinfo.yres;
+ }
+
+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
+ mp_msg(MSGT_VO, MSGL_ERR,
+ "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
+ strerror(errno));
+ }
+ }
+}
+
+static uint32_t get_image(mp_image_t *mpi)
+{
+ if(mpi->flags&MP_IMGFLAG_READABLE)
+ return VO_FALSE; // slow video ram
+ if(mpi->type==MP_IMGTYPE_STATIC)
+ return VO_FALSE; // it is not static
+
+ if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
+ // we're lucky or codec accepts stride => ok, let's go!
+
+ //YUY2 and RGB formats
+ mpi->planes[0] = center;
+ mpi->width = in_width;
+ mpi->stride[0] = fb_line_len;
+
+ // center image
+
+ mpi->flags |= MP_IMGFLAG_DIRECT;
+
+ return VO_TRUE;
+ }
+
+ return VO_FALSE;
+}
+
+static uint32_t put_image(mp_image_t *mpi)
+{
+ // already out?
+ if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK)))
+ return VO_TRUE;
+ return VO_FALSE;
 }
 static void uninit(void)
@@ -403,6 +478,10 @@ static int control(uint32_t request, voi
 switch (request) {
 case VOCTRL_QUERY_FORMAT:
 return query_format(*((uint32_t*)data));
+ case VOCTRL_GET_IMAGE:
+ return get_image(data);
+ case VOCTRL_DRAW_IMAGE:
+ return put_image(data);
 }
 return VO_NOTIMPL;
 }
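The vo_fbdev2.c changes combine direct rendering with page flipping: get_image() hands decoders a pointer straight into the framebuffer (USE_CONVERT2FB), query_format() doubles yres_virtual so two full frames fit in video memory, and flip_page() pans between the halves with FBIOPAN_DISPLAY instead of copying. The mechanism reduces to the following minimal sketch (not the driver code itself):

    #include <linux/fb.h>
    #include <sys/ioctl.h>

    /* Pan the visible area to the half of the doubled virtual screen that
     * was just rendered; the caller then draws into the other half. */
    static int fbdev_flip(int fb_fd, struct fb_var_screeninfo *vinfo)
    {
        vinfo->yoffset = vinfo->yoffset ? 0 : vinfo->yres;
        return ioctl(fb_fd, FBIOPAN_DISPLAY, vinfo);
    }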
--- a/version.sh
+++ b/version.sh
@@ -1,2 +1,2 @@
 #!/bin/sh
-echo "#define VERSION \"1.0rc1-$1\"" > version.h
+echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h