
mplayer-1.0rc2-100-atmel.1.patch 206 KB
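This is Atmel's AVR32 patch against MPlayer 1.0rc2 (revision 100 of the Atmel series, going by the file name). It teaches configure about the avr32 host architecture, adds a use-pico/nouse-pico option pair to the option tables, and contributes a new libavcodec backend under libavcodec/avr32/ with IDCT, FDCT, motion-compensation and H.264 helpers, much of it offloaded to the AVR32 PICO coprocessor. The global avr32_use_pico defaults to 1, so the PICO paths appear to be active unless the user passes -nouse-pico on the command line.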

--- a/cfg-common.h
+++ b/cfg-common.h
@@ -240,6 +240,10 @@
 {"psprobe", &ps_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
 {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+#ifdef ARCH_AVR32
+ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
+ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
+#endif
 // draw by slices or whole frame (useful with libmpeg2/libavcodec)
 {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
 {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
--- a/cfg-mencoder.h
+++ b/cfg-mencoder.h
@@ -5,6 +5,10 @@
 #include "cfg-common.h"
+#ifdef ARCH_AVR32
+extern int avr32_use_pico;
+#endif
+
 #ifdef USE_FAKE_MONO
 extern int fakemono; // defined in dec_audio.c
 #endif
  24. --- a/cfg-mplayer.h
  25. +++ b/cfg-mplayer.h
  26. @@ -7,6 +7,10 @@
  27. extern int key_fifo_size;
  28. extern unsigned doubleclick_time;
  29. +#ifdef ARCH_AVR32
  30. +extern int avr32_use_pico;
  31. +#endif
  32. +
  33. #ifdef HAVE_FBDEV
  34. extern char *fb_mode_cfgfile;
  35. extern char *fb_mode_name;
  36. --- a/configure
  37. +++ b/configure
  38. @@ -1631,7 +1631,7 @@ EOF
  39. fi
  40. -_arch_all='X86 X86_32 X86_64 IA64 SPARC ARM ARMV4L SH3 POWERPC PPC ALPHA SGI_MIPS PA_RISC S390 S390X VAX BFIN GENERIC'
  41. +_arch_all='X86 X86_32 X86_64 IA64 SPARC ARM ARMV4L AVR32 SH3 POWERPC PPC ALPHA SGI_MIPS PA_RISC S390 S390X VAX BFIN GENERIC'
  42. case "$host_arch" in
  43. i[3-9]86|x86|x86pc|k5|k6|k6-2|k6-3|pentium*|athlon*|i586-i686)
  44. _arch='X86 X86_32'
  45. @@ -1994,6 +1994,16 @@ EOF
  46. _optimizing="$proc"
  47. ;;
  48. + avr32)
  49. + _arch='AVR32'
  50. + _target_arch='ARCH_AVR32 = yes'
  51. + iproc='avr32'
  52. + proc=''
  53. + _march=''
  54. + _mcpu=''
  55. + _optimizing=''
  56. + ;;
  57. +
  58. arm|armv4l|armv5tel)
  59. _arch='ARM ARMV4L'
  60. _target_arch='ARCH_ARMV4L = yes'
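Note that the new avr32 case deliberately clears proc, _march, _mcpu and _optimizing, so configure selects ARCH_AVR32 without emitting any -march/-mcpu tuning flags; CPU selection is left entirely to the cross compiler (an invocation along the lines of ./configure --target=avr32-linux --cc=avr32-linux-gcc, exact flags assumed, not taken from this patch).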
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -372,6 +372,11 @@ ASM_OBJS-$(ARCH_ARMV4L) +
 OBJS-$(ARCH_ARMV4L) += armv4l/dsputil_arm.o \
 armv4l/mpegvideo_arm.o \
+ASM_OBJS-$(ARCH_AVR32) += avr32/idct.o avr32/fdct.o \
+ avr32/mc.o avr32/h264idct.o
+
+OBJS-$(ARCH_AVR32) += avr32/dsputil_avr32.o
+
 OBJS-$(HAVE_IWMMXT) += armv4l/dsputil_iwmmxt.o \
 armv4l/mpegvideo_iwmmxt.o \
@@ -445,6 +450,7 @@ clean::
 rm -f \
 alpha/*.o alpha/*~ \
 armv4l/*.o armv4l/*~ \
+ avr32/*.o avr32/*~ \
 bfin/*.o bfin/*~ \
 i386/*.o i386/*~ \
 mlib/*.o mlib/*~ \
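Next comes the new C dispatcher, libavcodec/avr32/dsputil_avr32.c. Two details are worth flagging before reading it: it redefines rnd_avg32 on top of the AVR32 pavg.ub instruction, a per-byte rounded average of two packed 32-bit words; and the chroma motion-compensation routines program the PICO coprocessor with the usual H.264 bilinear weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, which sum to 64 and therefore match the six fractional bits set via PICO_COEFF_FRAC_BITS(6). For comparison, a minimal portable sketch of the averaging identity the asm replaces (the same formula libavcodec uses generically; shown only for reference, not part of the patch):

 #include <stdint.h>

 /* Per-byte rounded average of two packed 4x8-bit words: the plain C
  * counterpart of the pavg.ub instruction used by rnd_avg32 below.
  * Masking with 0xFEFEFEFE keeps the per-byte shift from borrowing
  * across byte lanes. */
 static inline uint32_t rnd_avg32_c(uint32_t a, uint32_t b)
 {
     return (a | b) - (((a ^ b) & 0xFEFEFEFEu) >> 1);
 }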
--- /dev/null
+++ b/libavcodec/avr32/dsputil_avr32.c
@@ -0,0 +1,2638 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "../dsputil.h"
+#include "pico.h"
+
+int avr32_use_pico = 1;
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define DSP_FUNC_NAME(name) test_ ## name
+#else
+#define DSP_FUNC_NAME(name) name
+#endif
+
+union doubleword {
+ int64_t doubleword;
+ struct {
+ int32_t top;
+ int32_t bottom;
+ } words;
+};
+
+#undef LD16
+#undef LD32
+#undef LD64
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+#define LD64_UNALIGNED(a) \
+ ({ union doubleword __tmp__; \
+ __tmp__.words.top = LD32(a); \
+ __tmp__.words.bottom = LD32(a + 4); \
+ __tmp__.doubleword; })
+
+#undef ST32
+#undef ST16
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+
+#undef rnd_avg32
+#define rnd_avg32(a, b) \
+ ({ uint32_t __tmp__;\
+ asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
+ __tmp__;})
+
+void idct_avr32(DCTELEM *data);
+void fdct_avr32(DCTELEM *data);
+
+void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
+
+void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
+
+#define extern_dspfunc(PFX, NUM) \
+ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
+
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+#undef extern_dspfunc
+
+#ifdef CHECK_DSP_FUNCS_AGAINST_C
+#define extern_dspfunc(PFX, NUM) \
+ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
+ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
+
+extern_dspfunc(put, 4);
+extern_dspfunc(put_no_rnd, 4);
+extern_dspfunc(put, 8);
+extern_dspfunc(put_no_rnd, 8);
+extern_dspfunc(put, 16);
+extern_dspfunc(put_no_rnd, 16);
+extern_dspfunc(avg, 8);
+extern_dspfunc(avg_no_rnd, 8);
+extern_dspfunc(avg, 16);
+extern_dspfunc(avg_no_rnd, 16);
+
+
+#undef extern_dspfunc
+#define extern_dspfunc(PFX, NUM) \
+void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
+void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
+
+extern_dspfunc(put_h264_qpel, 16);
+extern_dspfunc(put_h264_qpel, 8);
+extern_dspfunc(put_h264_qpel, 4);
+extern_dspfunc(avg_h264_qpel, 16);
+extern_dspfunc(avg_h264_qpel, 8);
+extern_dspfunc(avg_h264_qpel, 4);
+
+#undef extern_dspfunc
+
+void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
+
+
+void dump_block8(uint8_t *block, int line_size, int h);
+void dump_block4(uint8_t *block, int line_size, int h);
+void dump_block(uint8_t *block, int line_size, int h, int w);
+
+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, char *name, int max_dev);
+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, char *name, int max_dev);
+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
+ int h, int width, char *name, int max_dev);
+
+#define PIXOP2( OPNAME, OP ) \
+void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ a= LD32(&src1[i*src_stride1+4]);\
+ b= LD32(&src2[i*src_stride2+4]);\
+ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+ }\
+}\
+\
+void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ }\
+}\
+\
+void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
+ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#else
+#define PIXOP2( OPNAME, OP ) \
+static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ OP(*((uint32_t*)(block )), LD32(pixels ));\
+ OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+ OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
+ OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
+ pixels+=line_size;\
+ block +=line_size;\
+ }\
+}\
+static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ a= LD32(&src1[i*src_stride1+4]);\
+ b= LD32(&src2[i*src_stride2+4]);\
+ OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
+ }\
+}\
+\
+static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ int i;\
+ for(i=0; i<h; i++){\
+ uint32_t a,b;\
+ a= LD32(&src1[i*src_stride1 ]);\
+ b= LD32(&src2[i*src_stride2 ]);\
+ OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
+ }\
+}\
+\
+static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+ int src_stride1, int src_stride2, int h){\
+ OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
+ OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+
+#endif
+
+#define op_avg(a, b) a = rnd_avg32(a, b)
+#define op_put(a, b) a = b
+
+PIXOP2(avg, op_avg)
+PIXOP2(put, op_put)
+#undef op_avg
+#undef op_put
+
+
+
+static void clear_blocks_avr32(DCTELEM *blocks)
+{
+ int n = 12;
+ uint64_t tmp1, tmp2;
+ blocks += 6*64;
+ asm volatile ( "mov\t%1, 0\n"
+ "mov\t%m1, 0\n"
+ "mov\t%2, 0\n"
+ "mov\t%m2, 0\n"
+ "0:\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "stm\t--%3, %1, %m1, %2, %m2\n"
+ "sub\t%0, 1\n"
+ "brne\t0b\n"
+ : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
+ "+r"(blocks));
+}
+
+
+static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+
+ int src0 = LD32(src);
+ int src1 = LD32(src + stride);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+ src += stride;
+ ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0));
+ dst += stride;
+ }
+}
+
+
+static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ dst+= stride;
+ src+= stride;
+ */
+
+ int src0 = LD32(src);
+ int src1 = (((int)src[4] << 24) | (int)src[stride]);
+ int src2 = LD32(src + stride + 1);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+ dst += stride;
+ }
+}
+
+static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+ dst+= stride;
+ src+= stride;
+ */
+ int src0 = LD32(src);
+ int src1 = (((int)src[4] << 24) | (int)src[stride]);
+ int src2 = LD32(src + stride + 1);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+
+ src0 = LD32(src + 4);
+ src1 = (src[8] << 24) | src[stride + 4];
+ src2 = LD32(src + stride + 5);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
+
+ dst += stride;
+ }
+}
+
+
+static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ int src0 = LD32(src);
+ int src1 = LD32(src + stride);
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
+ src += stride;
+ ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
+ dst += stride;
+ }
+}
+
+
+static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ dst+= stride;
+ src+= stride;
+ */
+
+ int src0 = *((int *)src);
+ int src1 = (int)((src[4] << 24) | src[stride]);
+ int src2 = *((int *)(src + stride + 1));
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+ dst += stride;
+ }
+}
+
+static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
+ const int A=(8-x)*(8-y);
+ const int B=( x)*(8-y);
+ const int C=(8-x)*( y);
+ const int D=( x)*( y);
+ int i;
+
+ PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF0_B, 32);
+ PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
+ PICO_PUT_W(PICO_COEFF1_B, 0);
+ PICO_PUT_W(PICO_COEFF2_A, 0);
+ PICO_PUT_W(PICO_COEFF2_B, 0);
+ PICO_PUT_W(PICO_CONFIG,
+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
+ | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
+ | PICO_COEFF_FRAC_BITS(6)
+ | PICO_OFFSET_FRAC_BITS(6));
+
+ for(i=0; i<h; i++)
+ {
+ /*
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
+ dst+= stride;
+ src+= stride;
+ */
+ int src0 = *((int *)src);
+ int src1 = (int)((src[4] << 24) | src[stride]);
+ int src2 = *((int *)(src + stride + 1));
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+
+ src0 = *((int *)(src + 4));
+ src1 = (int)((src[8] << 24) | src[stride + 4]);
+ src2 = *((int *)(src + stride + 5));
+
+ PICO_MVRC_W(PICO_INPIX0, src0);
+ PICO_MVRC_W(PICO_INPIX1, src1);
+ PICO_MVRC_W(PICO_INPIX2, src2);
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
+ src += stride;
+ ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
+ dst += stride;
+ }
+}
+
+static struct pico_config_t h264_qpel4_h_lowpass_config = {
+ .input_mode = PICO_HOR_FILTER_MODE,
+ .output_mode = PICO_PLANAR_MODE,
+ .coeff_frac_bits = 5,
+ .offset_frac_bits = 5,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 16,
+ .coeff1_0 = 20,
+ .coeff1_1 = -5,
+ .coeff1_2 = 1,
+ .coeff1_3 = 0,
+ .coeff2_0 = 0,
+ .coeff2_1 = 0,
+ .coeff2_2 = 0,
+ .coeff2_3 = 0
+};
+
+
+
+static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ const int h=4;
+ int i;
+
+ set_pico_config(&h264_qpel4_h_lowpass_config);
+
+ for(i=0; i<h; i++){
+
+ /*
+ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
+ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
+ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
+ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
+ dst+=dstStride;\
+ src+=srcStride;\ */
+ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
+ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
+ src += srcStride;
+ ST32(dst, PICO_GET_W(PICO_OUTPIX0));
+ dst += dstStride;
+ }
+}
+
+static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+ const int h=4;
+ int i;
+
+ set_pico_config(&h264_qpel4_h_lowpass_config);
+
+ for(i=0; i<h; i++){
+
+ /*
+ OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
+ OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
+ OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
+ OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
+ dst+=dstStride;\
+ src+=srcStride;\ */
+
+ PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
+ PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
+ PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
+ PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
+ PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
+ PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
+ src += srcStride;
+ ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
+ dst += dstStride;
+ }
+}
+
+static struct pico_config_t h264_qpel4_v_lowpass_config1 = {
+ .input_mode = PICO_VERT_FILTER_MODE,
+ .output_mode = PICO_PACKED_MODE,
+ .coeff_frac_bits = 5,
+ .offset_frac_bits = 5,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 16,
+ .coeff1_0 = 1,
+ .coeff1_1 = -5,
+ .coeff1_2 = 20,
+ .coeff1_3 = 16,
+ .coeff2_0 = 1,
+ .coeff2_1 = -5,
+ .coeff2_2 = 20,
+ .coeff2_3 = 16
+};
+
+
+
+static struct pico_config_t h264_qpel4_v_lowpass_config2 = {
+ .input_mode = PICO_VERT_FILTER_MODE,
+ .output_mode = PICO_PLANAR_MODE,
+ .coeff_frac_bits = 5,
+ .offset_frac_bits = 5,
+ .coeff0_0 = 1,
+ .coeff0_1 = -5,
+ .coeff0_2 = 20,
+ .coeff0_3 = 16,
+ .coeff1_0 = 20,
+ .coeff1_1 = -5,
+ .coeff1_2 = 1,
+ .coeff1_3 = 0,
+ .coeff2_0 = 0,
+ .coeff2_1 = 0,
+ .coeff2_2 = 0,
+ .coeff2_3 = 0
+};
+
+static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
+
+ /*
+ const int w=4;
+ uint8_t *cm = cropTbl + MAX_NEG_CROP;
+ int i;
+ for(i=0; i<w; i++)
+ {
+ const int srcB= src[-2*srcStride];\
+ const int srcA= src[-1*srcStride];\
+ const int src0= src[0 *srcStride];\
+ const int src1= src[1 *srcStride];\
+ const int src2= src[2 *srcStride];\
+ const int src3= src[3 *srcStride];\
+ const int src4= src[4 *srcStride];\
+ const int src5= src[5 *srcStride];\
+ const int src6= src[6 *srcStride];\
+ OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
+ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
+ OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
+ OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
+ dst++;\
+ src++;\
+ */
+
+ set_pico_config(&h264_qpel4_v_lowpass_config1);
+
+ {
+ int srcB= LD32(src - 2*srcStride);
+ int srcA= LD32(src - 1*srcStride);
+ int src0= LD32(src + 0 *srcStride);
+ int src1= LD32(src + 1 *srcStride);
+ int src2= LD32(src + 2 *srcStride);
+ int src3= LD32(src + 3 *srcStride);
+ int src4= LD32(src + 4 *srcStride);
+ int src5= LD32(src + 5 *srcStride);
+ int src6= LD32(src + 6 *srcStride);
+
+ union wordbytes {
+ int word;
+ struct {
+ unsigned int t:8;
+ unsigned int u:8;
+ unsigned int l:8;
+ unsigned int b:8;
+ } bytes;
+ } tmp1, tmp2, tmp3;
+
+ /* First compute the leftmost three columns */
  842. + PICO_MVRC_W(PICO_INPIX0, srcB);
  843. + PICO_MVRC_W(PICO_INPIX1, srcA);
  844. + PICO_MVRC_W(PICO_INPIX2, src0);
  845. + PICO_OP(0, 0, 0, 3, 6);
  846. + PICO_MVRC_W(PICO_INPIX2, src1);
  847. + PICO_MVRC_W(PICO_INPIX1, src2);
  848. + PICO_MVRC_W(PICO_INPIX0, src3);
  849. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  850. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  851. + dst += dstStride;
  852. + PICO_MVRC_W(PICO_INPIX0, srcA);
  853. + PICO_MVRC_W(PICO_INPIX1, src0);
  854. + PICO_MVRC_W(PICO_INPIX2, src1);
  855. + PICO_OP(0, 0, 0, 3, 6);
  856. + PICO_MVRC_W(PICO_INPIX2, src2);
  857. + PICO_MVRC_W(PICO_INPIX1, src3);
  858. + PICO_MVRC_W(PICO_INPIX0, src4);
  859. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  860. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  861. + dst += dstStride;
  862. + PICO_MVRC_W(PICO_INPIX0, src0);
  863. + PICO_MVRC_W(PICO_INPIX1, src1);
  864. + PICO_MVRC_W(PICO_INPIX2, src2);
  865. + PICO_OP(0, 0, 0, 3, 6);
  866. + PICO_MVRC_W(PICO_INPIX2, src3);
  867. + PICO_MVRC_W(PICO_INPIX1, src4);
  868. + PICO_MVRC_W(PICO_INPIX0, src5);
  869. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  870. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  871. + dst += dstStride;
  872. + PICO_MVRC_W(PICO_INPIX0, src1);
  873. + PICO_MVRC_W(PICO_INPIX1, src2);
  874. + PICO_MVRC_W(PICO_INPIX2, src3);
  875. + PICO_OP(0, 0, 0, 3, 6);
  876. + PICO_MVRC_W(PICO_INPIX2, src4);
  877. + PICO_MVRC_W(PICO_INPIX1, src5);
  878. + PICO_MVRC_W(PICO_INPIX0, src6);
  879. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  880. + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
  881. + /* Now compute the last column */
  882. +
  883. + tmp1.bytes.t = srcB;
  884. + tmp1.bytes.u = src1;
  885. + tmp1.bytes.l = src4;
  886. +
  887. + tmp2.bytes.t = srcA;
  888. + tmp2.bytes.u = src2;
  889. + tmp2.bytes.l = src5;
  890. +
  891. + tmp3.bytes.t = src0;
  892. + tmp3.bytes.u = src3;
  893. + tmp3.bytes.l = src6;
  894. +
  895. + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
  896. + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
  897. + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
  898. + set_pico_config(&h264_qpel4_v_lowpass_config2);
  899. +
  900. +
  901. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  902. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  903. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  904. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  905. +
  906. + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
  907. + dst[3] = (char)(tmp1.bytes.b);
  908. + dst[3 - dstStride] = (char)(tmp1.bytes.l);
  909. + dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
  910. + dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
  911. +
  912. + }
  917. +}
  918. +
  919. +static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  920. +
  921. + /*
  922. + const int w=4;
  923. + uint8_t *cm = cropTbl + MAX_NEG_CROP;
  924. + int i;
  925. + for(i=0; i<w; i++)
  926. + {
  927. + const int srcB= src[-2*srcStride];\
  928. + const int srcA= src[-1*srcStride];\
  929. + const int src0= src[0 *srcStride];\
  930. + const int src1= src[1 *srcStride];\
  931. + const int src2= src[2 *srcStride];\
  932. + const int src3= src[3 *srcStride];\
  933. + const int src4= src[4 *srcStride];\
  934. + const int src5= src[5 *srcStride];\
  935. + const int src6= src[6 *srcStride];\
  936. + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  937. + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  938. + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  939. + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  940. + dst++;\
  941. + src++;\
  942. + */
  943. + uint8_t tmp_block[4*4];
  944. +
  945. + set_pico_config(&h264_qpel4_v_lowpass_config1);
  946. +
  947. + {
  948. + int srcB= LD32(src - 2*srcStride);
  949. + int srcA= LD32(src - 1*srcStride);
  950. + int src0= LD32(src + 0 *srcStride);
  951. + int src1= LD32(src + 1 *srcStride);
  952. + int src2= LD32(src + 2 *srcStride);
  953. + int src3= LD32(src + 3 *srcStride);
  954. + int src4= LD32(src + 4 *srcStride);
  955. + int src5= LD32(src + 5 *srcStride);
  956. + int src6= LD32(src + 6 *srcStride);
  957. +
958. + /* First compute the leftmost three columns */
  959. + PICO_MVRC_W(PICO_INPIX0, srcB);
  960. + PICO_MVRC_W(PICO_INPIX1, srcA);
  961. + PICO_MVRC_W(PICO_INPIX2, src0);
  962. + PICO_OP(0, 0, 0, 3, 6);
  963. + PICO_MVRC_W(PICO_INPIX2, src1);
  964. + PICO_MVRC_W(PICO_INPIX1, src2);
  965. + PICO_MVRC_W(PICO_INPIX0, src3);
  966. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  967. + ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0));
  968. + PICO_MVRC_W(PICO_INPIX0, srcA);
  969. + PICO_MVRC_W(PICO_INPIX1, src0);
  970. + PICO_MVRC_W(PICO_INPIX2, src1);
  971. + PICO_OP(0, 0, 0, 3, 6);
  972. + PICO_MVRC_W(PICO_INPIX2, src2);
  973. + PICO_MVRC_W(PICO_INPIX1, src3);
  974. + PICO_MVRC_W(PICO_INPIX0, src4);
  975. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  976. + ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0));
  977. + PICO_MVRC_W(PICO_INPIX0, src0);
  978. + PICO_MVRC_W(PICO_INPIX1, src1);
  979. + PICO_MVRC_W(PICO_INPIX2, src2);
  980. + PICO_OP(0, 0, 0, 3, 6);
  981. + PICO_MVRC_W(PICO_INPIX2, src3);
  982. + PICO_MVRC_W(PICO_INPIX1, src4);
  983. + PICO_MVRC_W(PICO_INPIX0, src5);
  984. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  985. + ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0));
  986. + PICO_MVRC_W(PICO_INPIX0, src1);
  987. + PICO_MVRC_W(PICO_INPIX1, src2);
  988. + PICO_MVRC_W(PICO_INPIX2, src3);
  989. + PICO_OP(0, 0, 0, 3, 6);
  990. + PICO_MVRC_W(PICO_INPIX2, src4);
  991. + PICO_MVRC_W(PICO_INPIX1, src5);
  992. + PICO_MVRC_W(PICO_INPIX0, src6);
  993. + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
  994. + ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0));
  995. + /* Now compute the last column */
  996. +
  997. + union wordbytes {
  998. + int word;
  999. + struct {
  1000. + unsigned int t:8;
  1001. + unsigned int u:8;
  1002. + unsigned int l:8;
  1003. + unsigned int b:8;
1004. + } bytes;
 + } tmp1, tmp2, tmp3;
  1005. +
  1006. +
  1007. + tmp1.bytes.t = srcB;
  1008. + tmp1.bytes.u = src1;
  1009. + tmp1.bytes.l = src4;
  1010. +
  1011. + tmp2.bytes.t = srcA;
  1012. + tmp2.bytes.u = src2;
  1013. + tmp2.bytes.l = src5;
  1014. +
  1015. + tmp3.bytes.t = src0;
  1016. + tmp3.bytes.u = src3;
  1017. + tmp3.bytes.l = src6;
  1018. +
  1019. + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
  1020. + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
  1021. + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
  1022. + set_pico_config(&h264_qpel4_v_lowpass_config2);
  1023. +
  1024. +
  1025. + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
  1026. + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
  1027. + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
  1028. + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
  1029. +
  1030. + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
  1031. + tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
  1032. + tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
  1033. + tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
  1034. + tmp_block[3] = (char)(tmp1.bytes.t);
  1035. +
  1036. + /* Compute the average */
  1037. + srcB= LD32(dst);
  1038. + srcA= LD32(dst + dstStride);
  1039. + src0= LD32(dst + dstStride*2);
  1040. + src1= LD32(dst + dstStride*3);
  1041. +
  1042. + src2= LD32(tmp_block);
  1043. + src3= LD32(tmp_block + 4);
  1044. + src4= LD32(tmp_block + 8);
  1045. + src5= LD32(tmp_block + 12);
  1046. +
  1047. + ST32(dst, rnd_avg32(srcB, src2));
  1048. + ST32(dst + dstStride, rnd_avg32(srcA, src3));
  1049. + ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
  1050. + ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
  1051. + }
  1052. +}
  1053. +
  1054. +static struct pico_config_t h264_qpel4_hv_lowpass_config = {
  1055. + .input_mode = PICO_HOR_FILTER_MODE,
  1056. + .output_mode = PICO_PACKED_MODE,
  1057. + .coeff_frac_bits = 10,
  1058. + .offset_frac_bits = 10,
  1059. + .coeff0_0 = 1,
  1060. + .coeff0_1 = -5,
  1061. + .coeff0_2 = 20,
  1062. + .coeff0_3 = 512,
  1063. + .coeff1_0 = -5,
  1064. + .coeff1_1 = 25,
  1065. + .coeff1_2 = -100,
  1066. + .coeff1_3 = 0,
  1067. + .coeff2_0 = 20,
  1068. + .coeff2_1 = -100,
  1069. + .coeff2_2 = 400,
  1070. + .coeff2_3 = 0
  1071. +};
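 +
 +/* The rows look like the outer product of the 6-tap kernel (1, -5, 20) with
 + itself (e.g. 20*20 = 400, -5*20 = -100), so one accumulating pass can
 + apply both halves of the separable 2D filter; the combined gain is
 + 32*32, hence coeff_frac_bits = 10 and the rounding offset 512. */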
  1072. +
  1073. +static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1074. +
  1075. + int32_t tmp_block[48];
  1076. + int32_t *tmp = tmp_block;
  1077. + int i;
  1078. +
  1079. + set_pico_config(&h264_qpel4_hv_lowpass_config);
  1080. +
  1081. + src -= 2;
  1082. + for ( i = 0; i < 2; i++ ){
  1083. + int srcB= LD32(src - 2*srcStride);
  1084. + int srcA= LD32(src - 1*srcStride);
  1085. + int src0= LD32(src + 0 *srcStride);
  1086. + int src1= LD32(src + 1 *srcStride);
  1087. + int src2= LD32(src + 2 *srcStride);
  1088. + int src3= LD32(src + 3 *srcStride);
  1089. + int src4= LD32(src + 4 *srcStride);
  1090. + int src5= LD32(src + 5 *srcStride);
  1091. + int src6= LD32(src + 6 *srcStride);
  1092. +
  1093. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1094. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1095. + PICO_MVRC_W(PICO_INPIX2, src0);
  1096. + PICO_OP(0, 0, 0, 4, 8);
  1097. + PICO_MVRC_W(PICO_INPIX2, src1);
  1098. + PICO_MVRC_W(PICO_INPIX1, src2);
  1099. + PICO_MVRC_W(PICO_INPIX0, src3);
  1100. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1101. + PICO_STCM_W(tmp,
  1102. + PICO_REGVECT_VMU0_OUT,
  1103. + PICO_REGVECT_VMU1_OUT,
  1104. + PICO_REGVECT_VMU2_OUT);
  1105. + tmp += 3;
  1106. +
  1107. + PICO_OP(0, 0, 1, 5, 9);
  1108. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1109. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1110. + PICO_MVRC_W(PICO_INPIX2, src0);
  1111. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1112. + PICO_STCM_W(tmp,
  1113. + PICO_REGVECT_VMU0_OUT,
  1114. + PICO_REGVECT_VMU1_OUT,
  1115. + PICO_REGVECT_VMU2_OUT);
  1116. + tmp += 3;
  1117. +
  1118. + PICO_MVRC_W(PICO_INPIX0, src1);
  1119. + PICO_OP(0, 0, 4, 8, 0);
  1120. + PICO_MVRC_W(PICO_INPIX2, src2);
  1121. + PICO_MVRC_W(PICO_INPIX1, src3);
  1122. + PICO_MVRC_W(PICO_INPIX0, src4);
  1123. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1124. + PICO_STCM_W(tmp,
  1125. + PICO_REGVECT_VMU0_OUT,
  1126. + PICO_REGVECT_VMU1_OUT,
  1127. + PICO_REGVECT_VMU2_OUT);
  1128. + tmp += 3;
  1129. +
  1130. + PICO_OP(0, 0, 1, 5, 9);
  1131. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1132. + PICO_MVRC_W(PICO_INPIX1, src0);
  1133. + PICO_MVRC_W(PICO_INPIX2, src1);
  1134. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1135. + PICO_STCM_W(tmp,
  1136. + PICO_REGVECT_VMU0_OUT,
  1137. + PICO_REGVECT_VMU1_OUT,
  1138. + PICO_REGVECT_VMU2_OUT);
  1139. + tmp += 3;
  1140. +
  1141. + PICO_MVRC_W(PICO_INPIX0, src2);
  1142. + PICO_OP(0, 0, 4, 8, 0);
  1143. + PICO_MVRC_W(PICO_INPIX2, src3);
  1144. + PICO_MVRC_W(PICO_INPIX1, src4);
  1145. + PICO_MVRC_W(PICO_INPIX0, src5);
  1146. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1147. + PICO_STCM_W(tmp,
  1148. + PICO_REGVECT_VMU0_OUT,
  1149. + PICO_REGVECT_VMU1_OUT,
  1150. + PICO_REGVECT_VMU2_OUT);
  1151. + tmp += 3;
  1152. +
  1153. + PICO_OP(0, 0, 1, 5, 9);
  1154. + PICO_MVRC_W(PICO_INPIX0, src0);
  1155. + PICO_MVRC_W(PICO_INPIX1, src1);
  1156. + PICO_MVRC_W(PICO_INPIX2, src2);
  1157. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1158. + PICO_STCM_W(tmp,
  1159. + PICO_REGVECT_VMU0_OUT,
  1160. + PICO_REGVECT_VMU1_OUT,
  1161. + PICO_REGVECT_VMU2_OUT);
  1162. + tmp += 3;
  1163. +
  1164. + PICO_MVRC_W(PICO_INPIX0, src3);
  1165. + PICO_OP(0, 0, 4, 8, 0);
  1166. + PICO_MVRC_W(PICO_INPIX2, src4);
  1167. + PICO_MVRC_W(PICO_INPIX1, src5);
  1168. + PICO_MVRC_W(PICO_INPIX0, src6);
  1169. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1170. + PICO_STCM_W(tmp,
  1171. + PICO_REGVECT_VMU0_OUT,
  1172. + PICO_REGVECT_VMU1_OUT,
  1173. + PICO_REGVECT_VMU2_OUT);
  1174. + tmp += 3;
  1175. +
  1176. + PICO_OP(0, 0, 1, 5, 9);
  1177. + PICO_MVRC_W(PICO_INPIX0, src1);
  1178. + PICO_MVRC_W(PICO_INPIX1, src2);
  1179. + PICO_MVRC_W(PICO_INPIX2, src3);
  1180. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1181. + PICO_STCM_W(tmp,
  1182. + PICO_REGVECT_VMU0_OUT,
  1183. + PICO_REGVECT_VMU1_OUT,
  1184. + PICO_REGVECT_VMU2_OUT);
  1185. + tmp += 3;
  1186. + src += 2;
  1187. + }
  1188. +
  1189. + src -= 1;
  1190. + tmp -= 48;
  1191. +
  1192. +
  1193. + PICO_PUT_W(PICO_CONFIG,
  1194. + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
  1195. + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
  1196. + | PICO_COEFF_FRAC_BITS(10)
  1197. + | PICO_OFFSET_FRAC_BITS(10));
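 +
 + /* Second pass: rewind tmp and re-run the PICO over the 48 intermediate
 + words in vertical-filter mode, producing the final half-pel H+V
 + samples two columns per iteration. */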
  1198. +
  1199. + for ( i = 0; i < 2; i++ ){
  1200. + int srcB= LD32(src - 2*srcStride);
  1201. + int srcA= LD32(src - 1*srcStride);
  1202. + int src0= LD32(src + 0 *srcStride);
  1203. + int src1= LD32(src + 1 *srcStride);
  1204. + int src2= LD32(src + 2 *srcStride);
  1205. + int src3= LD32(src + 3 *srcStride);
  1206. + int src4= LD32(src + 4 *srcStride);
  1207. + int src5= LD32(src + 5 *srcStride);
  1208. + int src6= LD32(src + 6 *srcStride);
  1209. +
  1210. +
  1211. + PICO_LDCM_W_INC(tmp,
  1212. + PICO_REGVECT_VMU0_OUT,
  1213. + PICO_REGVECT_VMU1_OUT,
  1214. + PICO_REGVECT_VMU2_OUT);
  1215. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1216. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1217. + PICO_MVRC_W(PICO_INPIX2, src0);
  1218. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1219. + PICO_MVRC_W(PICO_INPIX2, src1);
  1220. + PICO_MVRC_W(PICO_INPIX1, src2);
  1221. + PICO_MVRC_W(PICO_INPIX0, src3);
  1222. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1223. +
  1224. + PICO_LDCM_W_INC(tmp,
  1225. + PICO_REGVECT_VMU0_OUT,
  1226. + PICO_REGVECT_VMU1_OUT,
  1227. + PICO_REGVECT_VMU2_OUT);
  1228. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1229. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1230. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1231. + PICO_MVRC_W(PICO_INPIX2, src0);
  1232. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1233. +
  1234. + PICO_LDCM_W_INC(tmp,
  1235. + PICO_REGVECT_VMU0_OUT,
  1236. + PICO_REGVECT_VMU1_OUT,
  1237. + PICO_REGVECT_VMU2_OUT);
  1238. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1239. + PICO_MVRC_W(PICO_INPIX1, src0);
  1240. + PICO_MVRC_W(PICO_INPIX2, src1);
  1241. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1242. + PICO_MVRC_W(PICO_INPIX2, src2);
  1243. + PICO_MVRC_W(PICO_INPIX1, src3);
  1244. + PICO_MVRC_W(PICO_INPIX0, src4);
  1245. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1246. +
  1247. + PICO_LDCM_W_INC(tmp,
  1248. + PICO_REGVECT_VMU0_OUT,
  1249. + PICO_REGVECT_VMU1_OUT,
  1250. + PICO_REGVECT_VMU2_OUT);
  1251. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1252. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1253. + PICO_MVRC_W(PICO_INPIX1, src0);
  1254. + PICO_MVRC_W(PICO_INPIX2, src1);
  1255. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1256. +
  1257. + ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
  1258. + ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
  1259. +
  1260. +
  1261. + PICO_LDCM_W_INC(tmp,
  1262. + PICO_REGVECT_VMU0_OUT,
  1263. + PICO_REGVECT_VMU1_OUT,
  1264. + PICO_REGVECT_VMU2_OUT);
  1265. + PICO_MVRC_W(PICO_INPIX0, src0);
  1266. + PICO_MVRC_W(PICO_INPIX1, src1);
  1267. + PICO_MVRC_W(PICO_INPIX2, src2);
  1268. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1269. + PICO_MVRC_W(PICO_INPIX2, src3);
  1270. + PICO_MVRC_W(PICO_INPIX1, src4);
  1271. + PICO_MVRC_W(PICO_INPIX0, src5);
1272. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1273. +
  1274. + PICO_LDCM_W_INC(tmp,
  1275. + PICO_REGVECT_VMU0_OUT,
  1276. + PICO_REGVECT_VMU1_OUT,
  1277. + PICO_REGVECT_VMU2_OUT);
  1278. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1279. + PICO_MVRC_W(PICO_INPIX0, src0);
  1280. + PICO_MVRC_W(PICO_INPIX1, src1);
  1281. + PICO_MVRC_W(PICO_INPIX2, src2);
1282. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1283. +
  1284. + PICO_LDCM_W_INC(tmp,
  1285. + PICO_REGVECT_VMU0_OUT,
  1286. + PICO_REGVECT_VMU1_OUT,
  1287. + PICO_REGVECT_VMU2_OUT);
  1288. + PICO_MVRC_W(PICO_INPIX0, src1);
  1289. + PICO_MVRC_W(PICO_INPIX1, src2);
  1290. + PICO_MVRC_W(PICO_INPIX2, src3);
  1291. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1292. + PICO_MVRC_W(PICO_INPIX2, src4);
  1293. + PICO_MVRC_W(PICO_INPIX1, src5);
  1294. + PICO_MVRC_W(PICO_INPIX0, src6);
1295. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1296. +
  1297. + PICO_LDCM_W_INC(tmp,
  1298. + PICO_REGVECT_VMU0_OUT,
  1299. + PICO_REGVECT_VMU1_OUT,
  1300. + PICO_REGVECT_VMU2_OUT);
  1301. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1302. + PICO_MVRC_W(PICO_INPIX0, src1);
  1303. + PICO_MVRC_W(PICO_INPIX1, src2);
  1304. + PICO_MVRC_W(PICO_INPIX2, src3);
1305. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1306. +
  1307. + ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
  1308. + ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
  1309. +
  1310. + dst += 2;
  1311. + src += 2;
  1312. + }
  1313. +}
  1314. +
  1315. +
  1316. +
  1317. +
  1318. +static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1319. +
  1320. + int32_t tmp_block[48];
  1321. + int32_t *tmp = tmp_block;
  1322. + int i;
  1323. +
  1324. + set_pico_config(&h264_qpel4_hv_lowpass_config);
  1325. +
  1326. + src -= 2;
  1327. + for ( i = 0; i < 2; i++ ){
  1328. + int srcB= LD32(src - 2*srcStride);
  1329. + int srcA= LD32(src - 1*srcStride);
  1330. + int src0= LD32(src + 0 *srcStride);
  1331. + int src1= LD32(src + 1 *srcStride);
  1332. + int src2= LD32(src + 2 *srcStride);
  1333. + int src3= LD32(src + 3 *srcStride);
  1334. + int src4= LD32(src + 4 *srcStride);
  1335. + int src5= LD32(src + 5 *srcStride);
  1336. + int src6= LD32(src + 6 *srcStride);
  1337. +
  1338. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1339. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1340. + PICO_MVRC_W(PICO_INPIX2, src0);
  1341. + PICO_OP(0, 0, 0, 4, 8);
  1342. + PICO_MVRC_W(PICO_INPIX2, src1);
  1343. + PICO_MVRC_W(PICO_INPIX1, src2);
  1344. + PICO_MVRC_W(PICO_INPIX0, src3);
  1345. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1346. + PICO_STCM_W(tmp,
  1347. + PICO_REGVECT_VMU0_OUT,
  1348. + PICO_REGVECT_VMU1_OUT,
  1349. + PICO_REGVECT_VMU2_OUT);
  1350. + tmp += 3;
  1351. +
  1352. + PICO_OP(0, 0, 1, 5, 9);
  1353. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1354. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1355. + PICO_MVRC_W(PICO_INPIX2, src0);
  1356. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1357. + PICO_STCM_W(tmp,
  1358. + PICO_REGVECT_VMU0_OUT,
  1359. + PICO_REGVECT_VMU1_OUT,
  1360. + PICO_REGVECT_VMU2_OUT);
  1361. + tmp += 3;
  1362. +
  1363. + PICO_MVRC_W(PICO_INPIX0, src1);
  1364. + PICO_OP(0, 0, 4, 8, 0);
  1365. + PICO_MVRC_W(PICO_INPIX2, src2);
  1366. + PICO_MVRC_W(PICO_INPIX1, src3);
  1367. + PICO_MVRC_W(PICO_INPIX0, src4);
  1368. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1369. + PICO_STCM_W(tmp,
  1370. + PICO_REGVECT_VMU0_OUT,
  1371. + PICO_REGVECT_VMU1_OUT,
  1372. + PICO_REGVECT_VMU2_OUT);
  1373. + tmp += 3;
  1374. +
  1375. + PICO_OP(0, 0, 1, 5, 9);
  1376. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1377. + PICO_MVRC_W(PICO_INPIX1, src0);
  1378. + PICO_MVRC_W(PICO_INPIX2, src1);
  1379. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1380. + PICO_STCM_W(tmp,
  1381. + PICO_REGVECT_VMU0_OUT,
  1382. + PICO_REGVECT_VMU1_OUT,
  1383. + PICO_REGVECT_VMU2_OUT);
  1384. + tmp += 3;
  1385. +
  1386. + PICO_MVRC_W(PICO_INPIX0, src2);
  1387. + PICO_OP(0, 0, 4, 8, 0);
  1388. + PICO_MVRC_W(PICO_INPIX2, src3);
  1389. + PICO_MVRC_W(PICO_INPIX1, src4);
  1390. + PICO_MVRC_W(PICO_INPIX0, src5);
  1391. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1392. + PICO_STCM_W(tmp,
  1393. + PICO_REGVECT_VMU0_OUT,
  1394. + PICO_REGVECT_VMU1_OUT,
  1395. + PICO_REGVECT_VMU2_OUT);
  1396. + tmp += 3;
  1397. +
  1398. + PICO_OP(0, 0, 1, 5, 9);
  1399. + PICO_MVRC_W(PICO_INPIX0, src0);
  1400. + PICO_MVRC_W(PICO_INPIX1, src1);
  1401. + PICO_MVRC_W(PICO_INPIX2, src2);
  1402. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1403. + PICO_STCM_W(tmp,
  1404. + PICO_REGVECT_VMU0_OUT,
  1405. + PICO_REGVECT_VMU1_OUT,
  1406. + PICO_REGVECT_VMU2_OUT);
  1407. + tmp += 3;
  1408. +
  1409. + PICO_MVRC_W(PICO_INPIX0, src3);
  1410. + PICO_OP(0, 0, 4, 8, 0);
  1411. + PICO_MVRC_W(PICO_INPIX2, src4);
  1412. + PICO_MVRC_W(PICO_INPIX1, src5);
  1413. + PICO_MVRC_W(PICO_INPIX0, src6);
  1414. + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
  1415. + PICO_STCM_W(tmp,
  1416. + PICO_REGVECT_VMU0_OUT,
  1417. + PICO_REGVECT_VMU1_OUT,
  1418. + PICO_REGVECT_VMU2_OUT);
  1419. + tmp += 3;
  1420. +
  1421. + PICO_OP(0, 0, 1, 5, 9);
  1422. + PICO_MVRC_W(PICO_INPIX0, src1);
  1423. + PICO_MVRC_W(PICO_INPIX1, src2);
  1424. + PICO_MVRC_W(PICO_INPIX2, src3);
  1425. + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
  1426. + PICO_STCM_W(tmp,
  1427. + PICO_REGVECT_VMU0_OUT,
  1428. + PICO_REGVECT_VMU1_OUT,
  1429. + PICO_REGVECT_VMU2_OUT);
  1430. + tmp += 3;
  1431. + src += 2;
  1432. + }
  1433. +
  1434. + src -= 1;
  1435. + tmp -= 48;
  1436. +
  1437. +
  1438. + PICO_PUT_W(PICO_CONFIG,
  1439. + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
  1440. + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
  1441. + | PICO_COEFF_FRAC_BITS(10)
  1442. + | PICO_OFFSET_FRAC_BITS(10));
  1443. +
  1444. + for ( i = 0; i < 2; i++ ){
  1445. + int srcB= LD32(src - 2*srcStride);
  1446. + int srcA= LD32(src - 1*srcStride);
  1447. + int src0= LD32(src + 0 *srcStride);
  1448. + int src1= LD32(src + 1 *srcStride);
  1449. + int src2= LD32(src + 2 *srcStride);
  1450. + int src3= LD32(src + 3 *srcStride);
  1451. + int src4= LD32(src + 4 *srcStride);
  1452. + int src5= LD32(src + 5 *srcStride);
  1453. + int src6= LD32(src + 6 *srcStride);
  1454. +
  1455. + PICO_LDCM_W_INC(tmp,
  1456. + PICO_REGVECT_VMU0_OUT,
  1457. + PICO_REGVECT_VMU1_OUT,
  1458. + PICO_REGVECT_VMU2_OUT);
  1459. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1460. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1461. + PICO_MVRC_W(PICO_INPIX2, src0);
  1462. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1463. + PICO_MVRC_W(PICO_INPIX2, src1);
  1464. + PICO_MVRC_W(PICO_INPIX1, src2);
  1465. + PICO_MVRC_W(PICO_INPIX0, src3);
  1466. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1467. +
  1468. + PICO_LDCM_W_INC(tmp,
  1469. + PICO_REGVECT_VMU0_OUT,
  1470. + PICO_REGVECT_VMU1_OUT,
  1471. + PICO_REGVECT_VMU2_OUT);
  1472. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1473. + PICO_MVRC_W(PICO_INPIX0, srcB);
  1474. + PICO_MVRC_W(PICO_INPIX1, srcA);
  1475. + PICO_MVRC_W(PICO_INPIX2, src0);
  1476. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1477. +
  1478. + PICO_LDCM_W_INC(tmp,
  1479. + PICO_REGVECT_VMU0_OUT,
  1480. + PICO_REGVECT_VMU1_OUT,
  1481. + PICO_REGVECT_VMU2_OUT);
  1482. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1483. + PICO_MVRC_W(PICO_INPIX1, src0);
  1484. + PICO_MVRC_W(PICO_INPIX2, src1);
  1485. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1486. + PICO_MVRC_W(PICO_INPIX2, src2);
  1487. + PICO_MVRC_W(PICO_INPIX1, src3);
  1488. + PICO_MVRC_W(PICO_INPIX0, src4);
  1489. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1490. +
  1491. + PICO_LDCM_W_INC(tmp,
  1492. + PICO_REGVECT_VMU0_OUT,
  1493. + PICO_REGVECT_VMU1_OUT,
  1494. + PICO_REGVECT_VMU2_OUT);
  1495. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1496. + PICO_MVRC_W(PICO_INPIX0, srcA);
  1497. + PICO_MVRC_W(PICO_INPIX1, src0);
  1498. + PICO_MVRC_W(PICO_INPIX2, src1);
  1499. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1500. +
  1501. + ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
  1502. + ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0)));
  1503. +
  1504. +
  1505. + PICO_LDCM_W_INC(tmp,
  1506. + PICO_REGVECT_VMU0_OUT,
  1507. + PICO_REGVECT_VMU1_OUT,
  1508. + PICO_REGVECT_VMU2_OUT);
  1509. + PICO_MVRC_W(PICO_INPIX0, src0);
  1510. + PICO_MVRC_W(PICO_INPIX1, src1);
  1511. + PICO_MVRC_W(PICO_INPIX2, src2);
  1512. + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
  1513. + PICO_MVRC_W(PICO_INPIX2, src3);
  1514. + PICO_MVRC_W(PICO_INPIX1, src4);
  1515. + PICO_MVRC_W(PICO_INPIX0, src5);
1516. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
  1517. +
  1518. + PICO_LDCM_W_INC(tmp,
  1519. + PICO_REGVECT_VMU0_OUT,
  1520. + PICO_REGVECT_VMU1_OUT,
  1521. + PICO_REGVECT_VMU2_OUT);
  1522. + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
  1523. + PICO_MVRC_W(PICO_INPIX0, src0);
  1524. + PICO_MVRC_W(PICO_INPIX1, src1);
  1525. + PICO_MVRC_W(PICO_INPIX2, src2);
1526. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
  1527. +
  1528. + PICO_LDCM_W_INC(tmp,
  1529. + PICO_REGVECT_VMU0_OUT,
  1530. + PICO_REGVECT_VMU1_OUT,
  1531. + PICO_REGVECT_VMU2_OUT);
  1532. + PICO_MVRC_W(PICO_INPIX0, src1);
  1533. + PICO_MVRC_W(PICO_INPIX1, src2);
  1534. + PICO_MVRC_W(PICO_INPIX2, src3);
  1535. + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
  1536. + PICO_MVRC_W(PICO_INPIX2, src4);
  1537. + PICO_MVRC_W(PICO_INPIX1, src5);
  1538. + PICO_MVRC_W(PICO_INPIX0, src6);
1539. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
  1540. +
  1541. + PICO_LDCM_W_INC(tmp,
  1542. + PICO_REGVECT_VMU0_OUT,
  1543. + PICO_REGVECT_VMU1_OUT,
  1544. + PICO_REGVECT_VMU2_OUT);
  1545. + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
  1546. + PICO_MVRC_W(PICO_INPIX0, src1);
  1547. + PICO_MVRC_W(PICO_INPIX1, src2);
  1548. + PICO_MVRC_W(PICO_INPIX2, src3);
1549. + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
  1550. +
  1551. + ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
  1552. + ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0)));
  1553. +
  1554. + dst += 2;
  1555. + src += 2;
  1556. + }
  1557. +}
  1558. +
  1559. +
  1560. +static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1561. + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1562. + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1563. + src += 4*srcStride;
  1564. + dst += 4*dstStride;
  1565. + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1566. + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1567. +}
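 +
 +/* The 8x8 and 16x16 lowpass variants below simply tile the 4x4 (resp. 8x8)
 + PICO kernels over the four quadrants of the block. */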
  1568. +
  1569. +static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1570. + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1571. + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1572. + src += 4*srcStride;
  1573. + dst += 4*dstStride;
  1574. + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
  1575. + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1576. +}
  1577. +
  1578. +static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1579. + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1580. + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1581. + src += 4*srcStride;
  1582. + dst += 4*dstStride;
  1583. + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1584. + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1585. +}
  1586. +
  1587. +static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1588. + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1589. + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1590. + src += 4*srcStride;
  1591. + dst += 4*dstStride;
  1592. + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
  1593. + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1594. +}
  1595. +
  1596. +static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1597. + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1598. + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1599. + src += 4*srcStride;
  1600. + dst += 4*dstStride;
  1601. + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1602. + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1603. +}
  1604. +
  1605. +static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1606. + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1607. + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1608. + src += 4*srcStride;
  1609. + dst += 4*dstStride;
  1610. + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1611. + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
  1612. +}
  1613. +
  1614. +static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1615. + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1616. + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1617. + src += 8*srcStride;
  1618. + dst += 8*dstStride;
  1619. + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1620. + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1621. +}
  1622. +
  1623. +static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1624. + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1625. + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1626. + src += 8*srcStride;
  1627. + dst += 8*dstStride;
  1628. + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
  1629. + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1630. +}
  1631. +
  1632. +static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1633. + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1634. + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1635. + src += 8*srcStride;
  1636. + dst += 8*dstStride;
  1637. + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1638. + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1639. +}
  1640. +
  1641. +static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1642. + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1643. + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1644. + src += 8*srcStride;
  1645. + dst += 8*dstStride;
  1646. + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
  1647. + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1648. +}
  1649. +
  1650. +static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1651. + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1652. + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1653. + src += 8*srcStride;
  1654. + dst += 8*dstStride;
  1655. + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1656. + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1657. +}
  1658. +
  1659. +static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
  1660. + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1661. + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1662. + src += 8*srcStride;
  1663. + dst += 8*dstStride;
  1664. + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
  1665. + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
  1666. +}
  1667. +
  1668. +
  1669. +#define H264_MC(OPNAME, SIZE) \
  1670. +static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
  1671. + OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
  1672. +}\
  1673. +\
  1674. +static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
  1675. + uint8_t half[SIZE*SIZE];\
  1676. + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
  1677. + OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
  1678. +}\
  1679. +\
  1680. +static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
  1681. + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
  1682. +}\
  1683. +\
  1684. +static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
  1685. + uint8_t half[SIZE*SIZE];\
  1686. + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
  1687. + OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
  1688. +}\
  1689. +\
  1690. +static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
  1691. + uint8_t full[SIZE*(SIZE+5)];\
  1692. + uint8_t * const full_mid= full + SIZE*2;\
  1693. + uint8_t half[SIZE*SIZE];\
  1694. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1695. + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
  1696. + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
  1697. +}\
  1698. +\
  1699. +static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
  1700. + uint8_t full[SIZE*(SIZE+5)];\
  1701. + uint8_t * const full_mid= full + SIZE*2;\
  1702. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1703. + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
  1704. +}\
  1705. +\
  1706. +static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
  1707. + uint8_t full[SIZE*(SIZE+5)];\
  1708. + uint8_t * const full_mid= full + SIZE*2;\
  1709. + uint8_t half[SIZE*SIZE];\
  1710. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1711. + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
  1712. + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
  1713. +}\
  1714. +\
  1715. +static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
  1716. + uint8_t full[SIZE*(SIZE+5)];\
  1717. + uint8_t * const full_mid= full + SIZE*2;\
  1718. + uint8_t halfH[SIZE*SIZE];\
  1719. + uint8_t halfV[SIZE*SIZE];\
  1720. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1721. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1722. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1723. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1724. +}\
  1725. +\
  1726. +static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
  1727. + uint8_t full[SIZE*(SIZE+5)];\
  1728. + uint8_t * const full_mid= full + SIZE*2;\
  1729. + uint8_t halfH[SIZE*SIZE];\
  1730. + uint8_t halfV[SIZE*SIZE];\
  1731. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1732. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1733. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1734. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1735. +}\
  1736. +\
  1737. +static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
  1738. + uint8_t full[SIZE*(SIZE+5)];\
  1739. + uint8_t * const full_mid= full + SIZE*2;\
  1740. + uint8_t halfH[SIZE*SIZE];\
  1741. + uint8_t halfV[SIZE*SIZE];\
  1742. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1743. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1744. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1745. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1746. +}\
  1747. +\
  1748. +static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
  1749. + uint8_t full[SIZE*(SIZE+5)];\
  1750. + uint8_t * const full_mid= full + SIZE*2;\
  1751. + uint8_t halfH[SIZE*SIZE];\
  1752. + uint8_t halfV[SIZE*SIZE];\
  1753. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1754. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1755. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1756. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  1757. +}\
  1758. +\
  1759. +static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
  1760. + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
  1761. +}\
  1762. +\
  1763. +static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
  1764. + uint8_t halfH[SIZE*SIZE];\
  1765. + uint8_t halfHV[SIZE*SIZE];\
  1766. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
  1767. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1768. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  1769. +}\
  1770. +\
  1771. +static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
  1772. + uint8_t halfH[SIZE*SIZE];\
  1773. + uint8_t halfHV[SIZE*SIZE];\
  1774. + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
  1775. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1776. + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  1777. +}\
  1778. +\
  1779. +static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
  1780. + uint8_t full[SIZE*(SIZE+5)];\
  1781. + uint8_t * const full_mid= full + SIZE*2;\
  1782. + uint8_t halfV[SIZE*SIZE];\
  1783. + uint8_t halfHV[SIZE*SIZE];\
  1784. + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
  1785. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1786. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1787. + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  1788. +}\
  1789. +\
  1790. +static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
  1791. + uint8_t full[SIZE*(SIZE+5)];\
  1792. + uint8_t * const full_mid= full + SIZE*2;\
  1793. + uint8_t halfV[SIZE*SIZE];\
  1794. + uint8_t halfHV[SIZE*SIZE];\
  1795. + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
  1796. + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
  1797. + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
  1798. + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  1799. +}\
  1800. +
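 +/* The mcXY suffix encodes the quarter-pel sub-position: X and Y are the
 + horizontal and vertical offsets in quarter pixels, so mc20 is the
 + horizontal half-pel case, mc02 the vertical one and mc22 the centre,
 + matching the generic dsputil naming. */
 +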
  1801. +H264_MC(put_, 4)
  1802. +H264_MC(put_, 8)
  1803. +H264_MC(put_, 16)
  1804. +H264_MC(avg_, 4)
  1805. +H264_MC(avg_, 8)
  1806. +H264_MC(avg_, 16)
  1807. +
  1808. +
  1809. +
  1810. +#define dspfunc16(PFX) \
  1811. + void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1812. + PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
  1813. + PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
  1814. + }\
  1815. + void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1816. + PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
  1817. + PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
  1818. + }\
  1819. + void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1820. + PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
  1821. + PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
  1822. + }\
  1823. + void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
  1824. + PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
  1825. + PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
  1826. + }\
  1827. +
  1828. +
  1829. +dspfunc16(put)
  1830. +dspfunc16(put_no_rnd)
  1831. +dspfunc16(avg)
  1832. +dspfunc16(avg_no_rnd)
  1833. +#undef dspfunc16
  1834. +
  1835. +static int pix_sum_avr32(uint8_t * pix, int line_size)
  1836. +{
  1837. + int s, i;
  1838. +
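 + /* Each iteration loads one 16-pixel row, unpacks byte pairs to halfwords
 + and accumulates them in the two halfword lanes of s. Worst case per
 + lane is 8*16*255 = 32640, which still fits a signed halfword, so the
 + two lanes only need to be combined once after the loop. */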
  1839. + s = 0;
  1840. + for (i = 0; i < 16; i++) {
  1841. + int tmp1,tmp2,tmp3,tmp4,tmp5;
  1842. + __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
  1843. + "ld.w\t%1, %6[4]\n\t"
  1844. + "ld.w\t%2, %6[8]\n\t"
  1845. + "ld.w\t%3, %6[12]\n\t"
  1846. + "punpckub.h\t%4, %0:t\n\t"
  1847. + "padd.h\t%5, %5, %4\n\t"
  1848. + "punpckub.h\t%4, %0:b\n\t"
  1849. + "padd.h\t%5, %5, %4\n\t"
  1850. + "punpckub.h\t%4, %1:t\n\t"
  1851. + "padd.h\t%5, %5, %4\n\t"
  1852. + "punpckub.h\t%4, %1:b\n\t"
  1853. + "padd.h\t%5, %5, %4\n\t"
  1854. + "punpckub.h\t%4, %2:t\n\t"
  1855. + "padd.h\t%5, %5, %4\n\t"
  1856. + "punpckub.h\t%4, %2:b\n\t"
  1857. + "padd.h\t%5, %5, %4\n\t"
  1858. + "punpckub.h\t%4, %3:t\n\t"
  1859. + "padd.h\t%5, %5, %4\n\t"
  1860. + "punpckub.h\t%4, %3:b\n\t"
  1861. + "padd.h\t%5, %5, %4\n\t"
  1862. + : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"=&r"(s)
  1863. + : "r"(pix));
  1864. + pix += line_size;
  1865. + }
  1866. + __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "=&r" (s) );
  1867. +
  1868. + return s;
  1869. +}
  1870. +
  1871. +
  1872. +//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
  1873. +//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
  1874. +//#define H264_WEIGHT(W,H) \
  1875. +//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
  1876. +// int attribute_unused x, y; \
  1877. +// offset <<= log2_denom; \
  1878. +// if(log2_denom) offset += 1<<(log2_denom-1); \
  1879. +// for(y=0; y<H; y++, block += stride){ \
  1880. +// uint32_t tmp0, tmp1;
  1881. +// if(W==2) { \
  1882. +// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
  1883. +// "ld.ub\t%[tmp1], %[block][1]\n" \
  1884. +// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
  1885. +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
  1886. +// "asr\t%[tmp0], %[log2_denom]\n" \
  1887. +// "asr\t%[tmp1], %[log2_denom]\n" \
  1888. +// "satu\t%[tmp0] >> 0, 8\n" \
  1889. +// "satu\t%[tmp1] >> 0, 8\n" \
  1890. +// "st.b\t%[block][0], %[tmp0]\n" \
  1891. +// "st.b\t%[block][1], %[tmp1]\n" \
  1892. +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
  1893. +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
  1894. +// } else if ( W==4 ) { \
  1895. +// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
  1896. +// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
  1897. +// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
  1898. +// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
  1899. +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
  1900. +// "asr\t%[tmp0], %[log2_denom]\n" \
  1901. +// "asr\t%[tmp1], %[log2_denom]\n" \
  1902. +// "satu\t%[tmp0] >> 0, 8\n" \
  1903. +// "satu\t%[tmp1] >> 0, 8\n" \
  1904. +// "st.b\t%[block][0], %[tmp0]\n" \
  1905. +// "st.b\t%[block][1], %[tmp1]\n" \
  1906. +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
  1907. +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
  1908. +//
  1909. +//
  1910. +//
  1911. +// if(W==4) continue; \
  1912. +// op_scale1(4); \
  1913. +// op_scale1(5); \
  1914. +// op_scale1(6); \
  1915. +// op_scale1(7); \
  1916. +// if(W==8) continue; \
  1917. +// op_scale1(8); \
  1918. +// op_scale1(9); \
  1919. +// op_scale1(10); \
  1920. +// op_scale1(11); \
  1921. +// op_scale1(12); \
  1922. +// op_scale1(13); \
  1923. +// op_scale1(14); \
  1924. +// op_scale1(15); \
  1925. +// } \
  1926. +//} \
  1927. +//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
  1928. +// int attribute_unused x, y; \
  1929. +// int offset = (offsets + offsetd + 1) >> 1; \
  1930. +// offset = ((offset << 1) + 1) << log2_denom; \
  1931. +// for(y=0; y<H; y++, dst += stride, src += stride){ \
  1932. +// op_scale2(0); \
  1933. +// op_scale2(1); \
  1934. +// if(W==2) continue; \
  1935. +// op_scale2(2); \
  1936. +// op_scale2(3); \
  1937. +// if(W==4) continue; \
  1938. +// op_scale2(4); \
  1939. +// op_scale2(5); \
  1940. +// op_scale2(6); \
  1941. +// op_scale2(7); \
  1942. +// if(W==8) continue; \
  1943. +// op_scale2(8); \
  1944. +// op_scale2(9); \
  1945. +// op_scale2(10); \
  1946. +// op_scale2(11); \
  1947. +// op_scale2(12); \
  1948. +// op_scale2(13); \
  1949. +// op_scale2(14); \
  1950. +// op_scale2(15); \
  1951. +// } \
  1952. +//}
  1953. +
  1954. +
  1955. +
  1956. +/* Returns zero in each byte where the absolute difference between <a> and <b>
  1957. + is not less than <compare> */
  1958. +#define PABS_DIFF_LESS_THAN( a, b, compare) \
  1959. + ({ uint32_t __tmp__, __tmp2__, __mask__; \
  1960. + asm ( \
  1961. + /* Check ABS( a - b ) < compare */ \
  1962. + "psubs.ub\t%[tmp], %[opa], %[opb]\n" \
  1963. + "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
  1964. + "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \
  1965. + /* This produces 0 for all bytes where the comparison is not true */ \
  1966. + "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
  1967. + : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \
  1968. + : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
  1969. + __mask__; })
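 +
 +/* Worked example for one byte lane: a = 0x10, b = 0x13, compare = 0x04
 + gives |a - b| = 3 and sat(4 - 3) = 1, i.e. nonzero because 3 < 4. */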
  1970. +
  1971. +/*
  1972. + Set all bytes containing zero in <value> to 255 and the rest to zero.
  1973. +
  1974. + Add with saturation 254 to all bytes making all bytes different from
  1975. + zero become 255. Then add one without saturation to make all bytes
  1976. + originally containing zero 255 and the rest 0. */
  1977. +#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
  1978. + ({ uint32_t __tmp__; \
  1979. + asm ( \
  1980. + "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \
  1981. + "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
  1982. + : [tmp] "=r"(__tmp__) \
  1983. + : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
  1984. + __tmp__; })
  1985. +
  1986. +#define PACKW_SH(upper, lower) \
  1987. + ({ uint32_t __tmp__; \
  1988. + asm ( \
  1989. + "packw.sh\t%[tmp], %[u], %[l]\n" \
  1990. + : [tmp] "=r"(__tmp__) \
  1991. + : [u] "r"(upper), [l] "r"(lower) ); \
  1992. + __tmp__; })
  1993. +
  1994. +#define PACKSH_UB(upper, lower) \
  1995. + ({ uint32_t __tmp__; \
  1996. + asm ( \
  1997. + "packsh.sb\t%[tmp], %[u], %[l]\n" \
  1998. + : [tmp] "=r"(__tmp__) \
  1999. + : [u] "r"(upper), [l] "r"(lower) ); \
  2000. + __tmp__; })
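 +
 +/* PACKW_SH followed by PACKSH_UB broadcasts a scalar into both halfwords
 + and then into all four byte lanes, so alpha, beta and tc0 can be
 + compared against four columns at once. */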
  2001. +
  2002. +static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  2003. +{
  2004. + int i;
  2005. +
  2006. + if ( alpha == 0 )
  2007. + return;
  2008. +
  2009. + alpha = PACKW_SH(alpha, alpha);
  2010. + alpha = PACKSH_UB(alpha, alpha);
  2011. + beta = PACKW_SH(beta, beta);
  2012. + beta = PACKSH_UB(beta, beta);
  2013. +
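 + /* Each of the four iterations filters four adjacent columns through
 + 32-bit loads; tc0[i] covers one such 4-column group, as in the
 + reference C code quoted below. */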
  2014. + for( i = 0; i < 4; i++ ) {
  2015. + uint32_t p0, p1, p2, q0, q1, q2;
  2016. + uint32_t mask, mask2;
  2017. + uint32_t tmp, tmp2, tmp3, tmp4;
  2018. +
  2019. + if( tc0[i] < 0 ) {
  2020. + pix += 4;
  2021. + continue;
  2022. + }
  2023. +
  2024. +/* for( d = 0; d < 4; d++ ) {
  2025. + const int p0 = pix[-1*stride];
  2026. + const int p1 = pix[-2*stride];
  2027. + const int p2 = pix[-3*stride];
  2028. + const int q0 = pix[0];
  2029. + const int q1 = pix[1*stride];
  2030. + const int q2 = pix[2*stride];
  2031. +
  2032. + if( ABS( p0 - q0 ) < alpha &&
  2033. + ABS( p1 - p0 ) < beta &&
  2034. + ABS( q1 - q0 ) < beta ) { */
  2035. +
  2036. + p0 = LD32(pix - stride);
  2037. + p1 = LD32(pix - 2*stride);
  2038. + q0 = LD32(pix);
  2039. + q1 = LD32(pix + stride);
  2040. +
2041. + /* Check which of the columns should be filtered, if any. A byte in
 + mask is set to 0xFF when at least one of the three conditions
 + fails, i.e. when that column must be left untouched. */
2042. + mask = SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(p0, q0, alpha));
2043. + mask |= SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(p1, p0, beta));
2044. + mask |= SET_ALL_BITS_IN_ZERO_BYTES(PABS_DIFF_LESS_THAN(q1, q0, beta));
2045. +
2046. + if ( mask == 0xFFFFFFFF ){
2047. + pix += 4;
2048. + continue;
2049. + }
  2050. +
  2051. +
  2052. + int tc = PACKW_SH(tc0[i], tc0[i]);
  2053. + int tc0_p = tc;
  2054. + int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
  2055. +
  2056. + /*
  2057. + int i_delta;
  2058. + if( ABS( p2 - p0 ) < beta ) {
  2059. + pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
  2060. + tc++;
  2061. + }*/
  2062. +
  2063. + p2 = LD32(pix - 3*stride);
  2064. + mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
  2065. +
  2066. + if ( mask2 ){
  2067. + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
  2068. + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
  2069. + "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
  2070. + "punpckub.h\t%[tmp2], %[tmp]:t\n"
  2071. + "punpckub.h\t%[tmp], %[tmp]:b\n"
  2072. + "punpckub.h\t%[tmp3], %[p1]:t\n"
  2073. + "punpckub.h\t%[tmp4], %[p1]:b\n"
  2074. + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2075. + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
  2076. + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
  2077. + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
  2078. + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
  2079. + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
  2080. + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2081. + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
  2082. + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
  2083. + "andn\t%[tmp], %[mask2]\n"
  2084. + "and\t%[tmp2], %[q1], %[mask2]\n"
  2085. + "or\t%[tmp], %[tmp2]\n"
  2086. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2087. + [tmp4]"=&r"(tmp4)
  2088. + : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
  2089. + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
  2090. + ST32(pix - 2*stride, tmp);
  2091. + tc += 0x00010001;
  2092. + }
  2093. +
  2094. +
  2095. + q2 = LD32(pix + 2*stride);
  2096. +
  2097. + /*
  2098. + if( ABS( q2 - q0 ) < beta ) {
  2099. + pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
  2100. + tc++;
  2101. + }
  2102. + */
  2103. + mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
  2104. +
  2105. + if ( mask2 ){
  2106. + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
  2107. + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
  2108. + "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
  2109. + "punpckub.h\t%[tmp2], %[tmp]:t\n"
  2110. + "punpckub.h\t%[tmp], %[tmp]:b\n"
  2111. + "punpckub.h\t%[tmp3], %[q1]:t\n"
  2112. + "punpckub.h\t%[tmp4], %[q1]:b\n"
  2113. + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2114. + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
  2115. + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
  2116. + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
  2117. + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
  2118. + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
  2119. + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
  2120. + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
  2121. + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
  2122. + "andn\t%[tmp], %[mask2]\n"
  2123. + "and\t%[tmp2], %[q1], %[mask2]\n"
  2124. + "or\t%[tmp], %[tmp2]\n"
  2125. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2126. + [tmp4]"=&r"(tmp4)
  2127. + : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
  2128. + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
  2129. + ST32(pix + stride, tmp);
  2130. + tc += 0x00010001;
  2131. + }
  2132. +
  2133. + uint32_t old_p0 = p0;
  2134. + uint32_t old_q0 = q0;
  2135. +
  2136. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  2137. + pix[-stride] = clip_uint8( p0 + i_delta );
  2138. + pix[0] = clip_uint8( q0 - i_delta ); */
  2139. +
  2140. + asm (
  2141. + /* Check if the two upper pixels should be filtered */
  2142. + "lsr\t%[tmp], %[inv_mask], 16\n"
  2143. + "breq\t0f\n"
  2144. +
  2145. + "punpckub.h\t%[tmp], %[p1]:t\n"
  2146. + "punpckub.h\t%[tmp2], %[q1]:t\n"
  2147. +
  2148. + /* p1 - q1 */
  2149. + "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
  2150. +
  2151. + "punpckub.h\t%[tmp3], %[q0]:t\n"
  2152. + "punpckub.h\t%[tmp4], %[p0]:t\n"
  2153. +
  2154. + /* q0 - p0 */
  2155. + "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
  2156. +
  2157. + /* (q0 - p0) << 2 */
  2158. + "plsl.h\t%[tmp2], %[tmp2], 2\n"
  2159. +
  2160. + /* ((q0 - p0) << 2) + (p1 - q1) */
  2161. + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
  2162. +
  2163. + "mov\t%[tmp], 0x00040004\n"
  2164. + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
  2165. + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
  2166. +
  2167. + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
  2168. + "pasr.h\t%[tmp2], %[tmp2], 3\n"
  2169. +
  2170. + "mov\t%[tmp], 0\n"
  2171. + "psub.h\t%[tmp], %[tmp], %[tc]\n"
  2172. +
  2173. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
  2174. + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
  2175. + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
  2176. +
  2177. +
  2178. + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
  2179. + "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
  2180. +
  2181. +
  2182. + /* pix[0] = clip_uint8( q0 - i_delta ); */
  2183. + "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
  2184. +
  2185. + /* Check if the two lower pixels should be filtered */
  2186. + "lsl\t%[tmp2], %[inv_mask], 16\n"
  2187. + "breq\t1f\n"
  2188. +
  2189. + "0:\n"
  2190. + "punpckub.h\t%[p1], %[p1]:b\n"
  2191. + "punpckub.h\t%[q1], %[q1]:b\n"
  2192. +
  2193. + /* p1 - q1 */
  2194. + "psub.h\t%[p1], %[p1], %[q1]\n"
  2195. +
  2196. + "punpckub.h\t%[q0], %[q0]:b\n"
  2197. + "punpckub.h\t%[p0], %[p0]:b\n"
  2198. +
  2199. + /* q0 - p0 */
  2200. + "psub.h\t%[tmp2], %[q0], %[p0]\n"
  2201. +
  2202. + /* (q0 - p0) << 2 */
  2203. + "plsl.h\t%[tmp2], %[tmp2], 2\n"
  2204. +
  2205. + /* ((q0 - p0) << 2) + (p1 - q1) */
  2206. + "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
  2207. +
  2208. + "mov\t%[q1], 0x00040004\n"
  2209. + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
  2210. + "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
  2211. +
  2212. + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
  2213. + "pasr.h\t%[tmp2], %[tmp2], 3\n"
  2214. +
  2215. + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
  2216. + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
  2217. + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
  2218. +
  2219. + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
  2220. + "padd.h\t%[p0], %[p0], %[tmp2]\n"
  2221. +
  2222. + /* pix[0] = clip_uint8( q0 - i_delta ); */
  2223. + "psub.h\t%[q0], %[q0], %[tmp2]\n"
  2224. +
  2225. + "1:\n"
  2226. + "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
  2227. + "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
  2228. +
  2229. + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
  2230. + [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
  2231. + : [tc]"r"(tc), [inv_mask]"r"(~mask));
  2232. +
  2233. + ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
  2234. + ST32(pix, (mask & old_q0) | (q0 & ~mask));
  2235. +
2236. + pix += 4;
2237. + }
  2238. +}
  2239. +
  2240. +
  2241. +
  2242. +
  2243. +#ifdef CHECK_DSP_FUNCS_AGAINST_C
  2244. +
  2245. +void dump_block8(uint8_t *block, int line_size, int h){
  2246. + int i, j;
  2247. +
  2248. + for ( i = 0; i < h ; i++ ){
  2249. + av_log(NULL, AV_LOG_ERROR, "\t");
  2250. + for ( j = 0; j < 8 ; j++ ){
  2251. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2252. + }
  2253. + av_log(NULL, AV_LOG_ERROR, "\n");
  2254. + }
  2255. +}
  2256. +
  2257. +void dump_block4(uint8_t *block, int line_size, int h){
  2258. + int i, j;
  2259. +
  2260. + for ( i = 0; i < h ; i++ ){
  2261. + av_log(NULL, AV_LOG_ERROR, "\t");
  2262. + for ( j = 0; j < 4 ; j++ ){
  2263. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2264. + }
  2265. + av_log(NULL, AV_LOG_ERROR, "\n");
  2266. + }
  2267. +}
  2268. +
  2269. +void dump_block(uint8_t *block, int line_size, int h, int w){
  2270. + int i, j;
  2271. +
  2272. + for ( i = 0; i < h ; i++ ){
  2273. + av_log(NULL, AV_LOG_ERROR, "\t");
  2274. + for ( j = 0; j < w ; j++ ){
  2275. + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
  2276. + }
  2277. + av_log(NULL, AV_LOG_ERROR, "\n");
  2278. + }
  2279. +}
  2280. +
  2281. +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2282. + int h, char *name, int max_dev){
  2283. + int i,j;
  2284. + for ( i = 0; i < 8 ; i++ ){
  2285. + for ( j = 0; j < h ; j++ ){
  2286. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2287. + diff = diff < 0 ? -diff : diff;
  2288. + if ( diff > max_dev ){
  2289. + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
  2290. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2291. + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
  2292. + dump_block8(test, line_size_test, h);
  2293. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2294. + dump_block8(correct, line_size_correct, h);
  2295. + exit(1);
  2296. + }
  2297. + }
  2298. + }
  2299. +}
  2300. +
  2301. +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2302. + int h, char *name, int max_dev){
  2303. + int i,j;
  2304. + for ( i = 0; i < 4 ; i++ ){
  2305. + for ( j = 0; j < h ; j++ ){
  2306. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2307. + diff = diff < 0 ? -diff : diff;
  2308. + if ( diff > max_dev ){
  2309. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
  2310. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2311. + av_log(NULL, AV_LOG_ERROR, "Error: resulting block from %s is:\n", name);
  2312. + dump_block4(test, line_size_test, h);
  2313. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2314. + dump_block4(correct, line_size_correct, h);
  2315. + exit(1);
  2316. + }
  2317. + }
  2318. + }
  2319. +}
  2320. +
  2321. +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
  2322. + int h, int width, char *name, int max_dev){
  2323. + int i,j;
  2324. + for ( i = 0; i < width ; i++ ){
  2325. + for ( j = 0; j < h ; j++ ){
  2326. + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
  2327. + diff = diff < 0 ? -diff : diff;
  2328. + if ( diff > max_dev ){
  2329. + av_log(NULL, AV_LOG_ERROR, "Error: pixel x=%i, y=%i differs. Is 0x%x, should be 0x%x\n",
  2330. + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
  2331. + av_log(NULL, AV_LOG_ERROR, "Error: resulting block from %s is:\n", name);
  2332. + dump_block(test, line_size_test, h, width);
  2333. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2334. + dump_block(correct, line_size_correct, h, width);
  2335. + exit(1);
  2336. + }
  2337. + }
  2338. + }
  2339. +}
  2340. +
  2341. +void dump_dct_block(DCTELEM *block){
  2342. + int i, j;
  2343. +
  2344. + for ( i = 0; i < 8 ; i++ ){
  2345. + av_log(NULL, AV_LOG_ERROR, "\t");
  2346. + for ( j = 0; j < 8 ; j++ ){
  2347. + av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]);
  2348. + }
  2349. + av_log(NULL, AV_LOG_ERROR, "\n");
  2350. + }
  2351. +}
  2352. +
  2353. +void test_idct_avr32(DCTELEM *block){
  2354. + DCTELEM testBlock[64];
  2355. + int i, j;
  2356. +
  2357. + /* Copy transposed block to testBlock (idct_avr32 is installed with FF_TRANSPOSE_IDCT_PERM, so it expects its input coefficients pre-transposed) */
  2358. + for ( i = 0; i < 8 ; i++ ){
  2359. + for ( j = 0; j < 8 ; j++ ){
  2360. + testBlock[i + 8*j] = block[j + i*8];
  2361. + }
  2362. + }
  2363. +
  2364. + idct_avr32(block);
  2365. + simple_idct(testBlock);
  2366. +
  2367. + for ( i = 0; i < 64 ; i++ ){
  2368. + if ( block[i] != testBlock[i] ){
  2369. + av_log(NULL, AV_LOG_ERROR, "Error: resulting block from idct is:\n");
  2370. + dump_dct_block(block);
  2371. + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
  2372. + dump_dct_block(testBlock);
  2373. + exit(1);
  2374. + }
  2375. + }
  2376. +}
  2377. +
  2378. +void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
  2379. + uint8_t testBlock[64];
  2380. + DCTELEM blockCopy[64];
  2381. + int i, j;
  2382. +
  2383. + /* Copy transposed block to blockCopy */
  2384. + for ( i = 0; i < 8 ; i++ ){
  2385. + for ( j = 0; j < 8 ; j++ ){
  2386. + blockCopy[i + 8*j] = block[j + i*8];
  2387. + }
  2388. + }
  2389. +
  2390. + idct_put_avr32(dest, line_size, block);
  2391. + simple_idct_put(testBlock, 8, blockCopy);
  2392. +
  2393. + check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
  2394. +}
  2395. +
  2396. +
  2397. +void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
  2398. + uint8_t testBlock[64];
  2399. + DCTELEM blockCopy[64];
  2400. + int i, j;
  2401. +
  2402. + /* Copy dest to testBlock */
  2403. + for ( i = 0; i < 8 ; i++ ){
  2404. + for ( j = 0; j < 8 ; j++ ){
  2405. + testBlock[i + 8*j] = dest[i + j*line_size];
  2406. + }
  2407. + }
  2408. +
  2409. + /* Copy transposed block to blockCopy */
  2410. + for ( i = 0; i < 8 ; i++ ){
  2411. + for ( j = 0; j < 8 ; j++ ){
  2412. + blockCopy[i + 8*j] = block[j + i*8];
  2413. + }
  2414. + }
  2415. +
  2416. + idct_add_avr32(dest, line_size, block);
  2417. + simple_idct_add(testBlock, 8, blockCopy);
  2418. +
  2419. + check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
  2420. +}
  2421. +
  2422. +void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
  2423. + uint8_t testBlock[16];
  2424. + DCTELEM blockCopy[16];
  2425. + int i, j;
  2426. +
  2427. + /* Copy dest to testBlock */
  2428. + for ( i = 0; i < 4 ; i++ ){
  2429. + for ( j = 0; j < 4 ; j++ ){
  2430. + testBlock[i + 4*j] = dest[i + j*stride];
  2431. + }
  2432. + }
  2433. +
  2434. + /* Copy source block to blockCopy */
  2435. + for ( i = 0; i < 16 ; i++ ){
  2436. + blockCopy[i] = block[i];
  2437. + }
  2438. +
  2439. + ff_h264_idct_add_c(dest, block, stride);
  2440. +
  2441. + h264_idct_add_avr32(testBlock, blockCopy, 4);
  2442. +
  2443. + check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0);
  2444. +}
  2445. +
  2446. +void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
  2447. + uint8_t testBlock[8*8];
  2448. + DCTELEM blockCopy[8*8];
  2449. + int i, j;
  2450. +
  2451. + /* Copy dest to testBlock */
  2452. + for ( i = 0; i < 8 ; i++ ){
  2453. + for ( j = 0; j < 8 ; j++ ){
  2454. + testBlock[i + 8*j] = dest[i + j*stride];
  2455. + }
  2456. + }
  2457. +
  2458. + /* Copy source block to blockCopy */
  2459. + for ( i = 0; i < 8*8 ; i++ ){
  2460. + blockCopy[i] = block[i];
  2461. + }
  2462. +
  2463. + ff_h264_idct8_add_c(dest, block, stride);
  2464. + h264_idct8_add_avr32(testBlock, blockCopy, 8);
  2465. +
  2466. + check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0);
  2467. +}
  2468. +
  2469. +void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
  2470. + const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
  2471. + uint8_t *testBlock, *testBlock2;
  2472. + int i, j;
  2473. + int input_v_size = h + in_v_size;
  2474. + int input_h_size = 8 + in_h_size;
  2475. +
  2476. + testBlock = alloca(input_h_size*input_v_size);
  2477. + testBlock2 = alloca(input_h_size*input_v_size);
  2478. +
  2479. + for ( i = 0; i < input_h_size ; i++ ){
  2480. + for ( j = 0; j < input_v_size ; j++ ){
  2481. + testBlock[i + input_h_size*j] = pixels[i + j*line_size];
  2482. + }
  2483. + }
  2484. +
  2485. + test(block, pixels, line_size, h);
  2486. + correct(testBlock2, testBlock, input_h_size, h);
  2487. +
  2488. + check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
  2489. +
  2490. +}
  2491. +
  2492. +void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
  2493. + uint8_t *src, int stride, int h, int w, int x, int y, char *name){
  2494. + uint8_t *testBlock, *testBlock2;
  2495. + int i, j;
  2496. + int input_v_size = h + 1;
  2497. + int input_h_size = ((w + 1) + 3) & ~3;
  2498. +
  2499. + testBlock = alloca(input_h_size*input_v_size);
  2500. + testBlock2 = alloca(input_h_size*input_v_size);
  2501. +
  2502. + for ( i = 0; i < w + 1 ; i++ ){
  2503. + for ( j = 0; j < h + 1 ; j++ ){
  2504. + testBlock[i + input_h_size*j] = src[i + j*stride];
  2505. + }
  2506. + }
  2507. +
  2508. + for ( i = 0; i < w ; i++ ){
  2509. + for ( j = 0; j < h ; j++ ){
  2510. + testBlock2[i + input_h_size*j] = dst[i + j*stride];
  2511. + }
  2512. + }
  2513. +
  2514. + test(dst, src, stride, h, x, y);
  2515. + correct(testBlock2, testBlock, input_h_size, h, x, y);
  2516. +
  2517. + check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0);
  2518. +
  2519. +}
  2520. +
  2521. +void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
  2522. + uint8_t *src, int stride, int size, char *name){
  2523. + uint8_t *testBlock, *testBlock2;
  2524. + int i, j;
  2525. + int test_stride = size + 8;
  2526. +
  2527. + testBlock = (uint8_t *)alloca(test_stride*(size+8)) + 4 + test_stride*4;
  2528. + testBlock2 = alloca(test_stride*size);
  2529. +
  2530. + for ( i = -4; i < size+4 ; i++ ){
  2531. + for ( j = -4; j < size+4 ; j++ ){
  2532. + testBlock[i + test_stride*j] = src[i + j*stride];
  2533. + }
  2534. + }
  2535. +
  2536. + for ( i = 0; i < size ; i++ ){
  2537. + for ( j = 0; j < size ; j++ ){
  2538. + testBlock2[i + test_stride*j] = dst[i + j*stride];
  2539. + }
  2540. + }
  2541. +
  2542. + correct(dst, src, stride);
  2543. + test(testBlock2, testBlock, test_stride);
  2544. +
  2545. + check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
  2546. +
  2547. +}
  2548. +
  2549. +
  2550. +#define test_pixels_funcs(PFX, NUM ) \
  2551. +void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2552. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
  2553. + block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
  2554. +void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2555. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
  2556. + block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
  2557. +void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2558. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
  2559. + block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
  2560. +void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
  2561. + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
  2562. + block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
  2563. +
  2564. +test_pixels_funcs(put, 8);
  2565. +test_pixels_funcs(put_no_rnd, 8);
  2566. +test_pixels_funcs(put, 16);
  2567. +test_pixels_funcs(put_no_rnd, 16);
  2568. +
  2569. +test_pixels_funcs(avg, 8);
  2570. +test_pixels_funcs(avg_no_rnd, 8);
  2571. +test_pixels_funcs(avg, 16);
  2572. +test_pixels_funcs(avg_no_rnd, 16);
  2573. +
  2574. +#define test_h264_chroma_mc_funcs(PFX, NUM ) \
  2575. +void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
  2576. + test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
  2577. + dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
  2578. +
  2579. +test_h264_chroma_mc_funcs(put, 2);
  2580. +test_h264_chroma_mc_funcs(put, 4);
  2581. +test_h264_chroma_mc_funcs(put, 8);
  2582. +test_h264_chroma_mc_funcs(avg, 2);
  2583. +test_h264_chroma_mc_funcs(avg, 4);
  2584. +test_h264_chroma_mc_funcs(avg, 8);
  2585. +
  2586. +#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
  2587. +void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
  2588. + test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
  2589. + dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
  2590. +
  2591. +#define test_qpel_mc_funcs(PFX, NUM) \
  2592. + test_qpel_mc_funcs_type(PFX, NUM, mc00);\
  2593. + test_qpel_mc_funcs_type(PFX, NUM, mc10);\
  2594. + test_qpel_mc_funcs_type(PFX, NUM, mc20);\
  2595. + test_qpel_mc_funcs_type(PFX, NUM, mc30);\
  2596. + test_qpel_mc_funcs_type(PFX, NUM, mc01);\
  2597. + test_qpel_mc_funcs_type(PFX, NUM, mc11);\
  2598. + test_qpel_mc_funcs_type(PFX, NUM, mc21);\
  2599. + test_qpel_mc_funcs_type(PFX, NUM, mc31);\
  2600. + test_qpel_mc_funcs_type(PFX, NUM, mc02);\
  2601. + test_qpel_mc_funcs_type(PFX, NUM, mc12);\
  2602. + test_qpel_mc_funcs_type(PFX, NUM, mc22);\
  2603. + test_qpel_mc_funcs_type(PFX, NUM, mc32);\
  2604. + test_qpel_mc_funcs_type(PFX, NUM, mc03);\
  2605. + test_qpel_mc_funcs_type(PFX, NUM, mc13);\
  2606. + test_qpel_mc_funcs_type(PFX, NUM, mc23);\
  2607. + test_qpel_mc_funcs_type(PFX, NUM, mc33)
  2608. +
  2609. +test_qpel_mc_funcs(put_h264_qpel, 4);
  2610. +test_qpel_mc_funcs(put_h264_qpel, 8);
  2611. +test_qpel_mc_funcs(put_h264_qpel, 16);
  2612. +test_qpel_mc_funcs(avg_h264_qpel, 4);
  2613. +test_qpel_mc_funcs(avg_h264_qpel, 8);
  2614. +test_qpel_mc_funcs(avg_h264_qpel, 16);
  2615. +
  2616. +
  2617. +#define dspfunc(PFX, IDX, NUM) \
  2618. + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
  2619. + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
  2620. + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
  2621. + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
  2622. + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
  2623. + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
  2624. + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
  2625. + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
  2626. + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
  2627. + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
  2628. + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
  2629. + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
  2630. + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
  2631. + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
  2632. + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
  2633. + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
  2634. +
  2635. +#endif
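
DSP_FUNC_NAME, used throughout the init code below, is defined earlier in the patch; judging from the wrapper names generated above, it presumably expands to the test_ wrapper when CHECK_DSP_FUNCS_AGAINST_C is defined and to the plain function otherwise, along the lines of:

    #ifdef CHECK_DSP_FUNCS_AGAINST_C
    #define DSP_FUNC_NAME(name) test_ ## name
    #else
    #define DSP_FUNC_NAME(name) name
    #endif

so each installed table entry runs the AVR32 routine and the C reference side by side when checking is enabled.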
  2636. +
  2637. +void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
  2638. +{
  2639. +
  2640. + /* H264 */
  2641. +
  2642. + if ( 0 /*avr32_use_pico*/ ){
  2643. + c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
  2644. + c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
  2645. + c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
  2646. +
  2647. + c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
  2648. + c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
  2649. + c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
  2650. + }
  2651. +
  2652. +#define dspfunc(PFX, IDX, NUM) \
  2653. + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
  2654. + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
  2655. + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
  2656. + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
  2657. + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
  2658. + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
  2659. + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
  2660. + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
  2661. + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
  2662. + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
  2663. + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
  2664. + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
  2665. + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
  2666. + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
  2667. + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
  2668. + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
  2669. +
  2670. + if ( avr32_use_pico ){
  2671. + dspfunc(put_h264_qpel, 0, 16);
  2672. + dspfunc(put_h264_qpel, 1, 8);
  2673. + dspfunc(put_h264_qpel, 2, 4);
  2674. + dspfunc(avg_h264_qpel, 0, 16);
  2675. + dspfunc(avg_h264_qpel, 1, 8);
  2676. + dspfunc(avg_h264_qpel, 2, 4);
  2677. + }
  2678. +
  2679. + c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
  2680. + c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
  2681. + c->idct = DSP_FUNC_NAME(idct_avr32);
  2682. + c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
  2683. + c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
  2684. +
  2685. + /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
  2686. +
  2687. + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
  2688. +
  2689. + c->fdct = fdct_avr32;
  2690. +
  2691. + c->clear_blocks = clear_blocks_avr32;
  2692. +
  2693. +#undef dspfunc
  2694. +#define dspfunc(PFX, IDX, NUM) \
  2695. + c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
  2696. + c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
  2697. + c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
  2698. + c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
  2699. +
  2700. + dspfunc(put, 0, 16);
  2701. + dspfunc(put_no_rnd, 0, 16);
  2702. + dspfunc(put, 1, 8);
  2703. + dspfunc(put_no_rnd, 1, 8);
  2704. +
  2705. + dspfunc(avg, 1, 8);
  2706. + dspfunc(avg_no_rnd, 1, 8);
  2707. + dspfunc(avg, 0, 16);
  2708. + dspfunc(avg_no_rnd, 0, 16);
  2709. +#undef dspfunc
  2710. +
  2711. +}
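
Note the FF_TRANSPOSE_IDCT_PERM selected above: it makes the generic dsputil code store coefficients in transposed scan order before the IDCT is called, which in the dsputil.c of this vintage is simply:

    for (i = 0; i < 64; i++)
        c->idct_permutation[i] = ((i & 7) << 3) | (i >> 3);

This is why idct_avr32 consumes transposed blocks and why the test harness transposes its inputs before calling the C reference.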
  2712. +
  2713. +
  2714. +
  2715. +#if 0
  2716. +int main(int argc, char *argv[]){
  2717. +
  2718. +
  2719. +}
  2720. +#endif
  2721. +
  2722. --- /dev/null
  2723. +++ b/libavcodec/avr32/fdct.S
  2724. @@ -0,0 +1,541 @@
  2725. +/*
  2726. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  2727. + *
  2728. + * Redistribution and use in source and binary forms, with or without
  2729. + * modification, are permitted provided that the following conditions
  2730. + * are met:
  2731. + *
  2732. + * 1. Redistributions of source code must retain the above copyright
  2733. + * notice, this list of conditions and the following disclaimer.
  2734. + *
  2735. + * 2. Redistributions in binary form must reproduce the above
  2736. + * copyright notice, this list of conditions and the following
  2737. + * disclaimer in the documentation and/or other materials provided
  2738. + * with the distribution.
  2739. + *
  2740. + * 3. The name of ATMEL may not be used to endorse or promote products
  2741. + * derived from this software without specific prior written
  2742. + * permission.
  2743. + *
  2744. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  2745. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  2746. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  2747. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  2748. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  2749. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  2750. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  2751. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  2752. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  2753. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  2754. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  2755. + * DAMAGE.
  2756. + */
  2757. +
  2758. +//**********************************************************
  2759. +//* 2-D fDCT, based on:                                    *
  2760. +//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
  2761. +//* Fast 1-D DCT Algorithms with 11 Multiplications",      *
  2762. +//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal    *
  2763. +//* Processing 1989 (ICASSP '89), pp. 988-991.             *
  2764. +//*                                                        *
  2765. +//* Fixed-point implementation optimized for the AVR32     *
  2766. +//* instruction set. If a table is used for the            *
  2767. +//* coefficients, two of them can be loaded with a single  *
  2768. +//* word access. This gives a reduction in the number of   *
  2769. +//* memory loads.                                          *
  2770. +//*                                                        *
  2771. +//**********************************************************
  2772. +
  2773. +
  2774. +/* This routine is a slow-but-accurate integer implementation of the
  2775. + * forward DCT (Discrete Cosine Transform). Taken from the IJG software
  2776. + *
  2777. + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  2778. + * on each column. Direct algorithms are also available, but they are
  2779. + * much more complex and seem not to be any faster when reduced to code.
  2780. + *
  2781. + * This implementation is based on an algorithm described in
  2782. + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  2783. + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  2784. + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  2785. + * The primary algorithm described there uses 11 multiplies and 29 adds.
  2786. + * We use their alternate method with 12 multiplies and 32 adds.
  2787. + * The advantage of this method is that no data path contains more than one
  2788. + * multiplication; this allows a very simple and accurate implementation in
  2789. + * scaled fixed-point arithmetic, with a minimal number of shifts.
  2790. + *
  2791. + * The poop on this scaling stuff is as follows:
  2792. + *
  2793. + * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
  2794. + * larger than the true DCT outputs. The final outputs are therefore
  2795. + * a factor of N larger than desired; since N=8 this can be cured by
  2796. + * a simple right shift at the end of the algorithm. The advantage of
  2797. + * this arrangement is that we save two multiplications per 1-D DCT,
  2798. + * because the y0 and y4 outputs need not be divided by sqrt(N).
  2799. + * In the IJG code, this factor of 8 is removed by the quantization step
  2800. + * (in jcdctmgr.c); here it is removed by the final descaling shift of pass 2.
  2801. + *
  2802. + * We have to do addition and subtraction of the integer inputs, which
  2803. + * is no problem, and multiplication by fractional constants, which is
  2804. + * a problem to do in integer arithmetic. We multiply all the constants
  2805. + * by CONST_SCALE and convert them to integer constants (thus retaining
  2806. + * CONST_BITS bits of precision in the constants). After doing a
  2807. + * multiplication we have to divide the product by CONST_SCALE, with proper
  2808. + * rounding, to produce the correct output. This division can be done
  2809. + * cheaply as a right shift of CONST_BITS bits. We postpone shifting
  2810. + * as long as possible so that partial sums can be added together with
  2811. + * full fractional precision.
  2812. + *
  2813. + * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  2814. + * they are represented to better-than-integral precision. These outputs
  2815. + * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
  2816. + * with the recommended scaling. (For 12-bit sample data, the intermediate
  2817. + * array is INT32 anyway.)
  2818. + *
  2819. + * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  2820. + * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
  2821. + * shows that the values given below are the most effective.
  2822. + *
  2823. + * We can gain a little more speed, with a further compromise in accuracy,
  2824. + * by omitting the addition in a descaling shift. This yields an incorrectly
  2825. + * rounded result half the time...
  2826. + */
  2827. +
  2828. + .global fdct_avr32
  2829. +
  2830. +
  2831. +
  2832. +#define CONST_BITS 13
  2833. +#define PASS1_BITS 2
  2834. +
  2835. +#define FIX_0_298631336 2446 /* FIX(0.298631336) */
  2836. +#define FIX_0_390180644 3196 /* FIX(0.390180644) */
  2837. +#define FIX_0_541196100 4433 /* FIX(0.541196100) */
  2838. +#define FIX_0_765366865 6270 /* FIX(0.765366865) */
  2839. +#define FIX_0_899976223 7373 /* FIX(0.899976223) */
  2840. +#define FIX_1_175875602 9633 /* FIX(1.175875602) */
  2841. +#define FIX_1_501321110 12299 /* FIX(1.501321110) */
  2842. +#define FIX_1_847759065 15137 /* FIX(1.847759065) */
  2843. +#define FIX_1_961570560 16069 /* FIX(1.961570560) */
  2844. +#define FIX_2_053119869 16819 /* FIX(2.053119869) */
  2845. +#define FIX_2_562915447 20995 /* FIX(2.562915447) */
  2846. +#define FIX_3_072711026 25172 /* FIX(3.072711026) */
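
These values follow the usual IJG conventions: FIX(x) scales a real constant to CONST_BITS of fixed-point precision, and the DESCALE operation referred to in the comments is a rounding right shift. As a sketch matching jfdctint.c:

    #define CONST_SCALE  (1 << CONST_BITS)                   /* 2^13 = 8192 */
    #define FIX(x)       ((int)((x) * CONST_SCALE + 0.5))    /* e.g. FIX(0.541196100) == 4433 */
    #define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))     /* divide by 2^n with rounding */

On AVR32, the satrnds instruction used below performs this rounding shift together with signed saturation in a single operation.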
  2847. +
  2848. +
  2849. +/*
  2850. + * Perform an integer forward DCT on one block of samples.
  2851. + */
  2852. +
  2853. +//void
  2854. +//fdct_int32(short *const block)
  2855. +//{
  2856. +// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2857. +// int tmp10, tmp11, tmp12, tmp13;
  2858. +// int z1, z2, z3, z4, z5;
  2859. +// short *blkptr;
  2860. +// int *dataptr;
  2861. +// int data[64];
  2862. +// int i;
  2863. +//
  2864. +// /* Pass 1: process rows. */
  2865. +// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2866. +// /* furthermore, we scale the results by 2**PASS1_BITS. */
  2867. +//
  2868. +// dataptr = data;
  2869. +// blkptr = block;
  2870. +
  2871. + .text
  2872. +fdct_avr32:
  2873. + pushm r0-r3, r4-r7, lr
  2874. +#define loop_ctr r0
  2875. +#define blkptr r12
  2876. +#define x0 r1
  2877. +#define x1 r2
  2878. +#define x2 r3
  2879. +#define x3 r4
  2880. +#define x4 r5
  2881. +#define x5 r6
  2882. +#define x6 r7
  2883. +#define x7 r8
  2884. +#define tmp0 r5
  2885. +#define tmp7 r2
  2886. +#define tmp1 r3
  2887. +#define tmp6 r4
  2888. +#define tmp2 r9
  2889. +#define tmp5 r8
  2890. +#define tmp3 r7
  2891. +#define tmp4 r6
  2892. +
  2893. +
  2894. + mov loop_ctr, 8
  2895. +// for (i = 0; i < 8; i++) {
  2896. +ROW_LOOP:
  2897. +
  2898. + ldm blkptr, r1, r2, r3, r4
  2899. +
  2900. +// tmp2 = blkptr[2] + blkptr[5];
  2901. +// tmp3 = blkptr[3] + blkptr[4];
  2902. + paddx.h r5, r3, r2
  2903. +// tmp5 = blkptr[2] - blkptr[5];
  2904. +// tmp4 = blkptr[3] - blkptr[4];
  2905. + psubx.h r6, r3, r2
  2906. +// tmp0 = blkptr[0] + blkptr[7];
  2907. +// tmp1 = blkptr[1] + blkptr[6];
  2908. + paddx.h r2, r4, r1
  2909. +// tmp7 = blkptr[0] - blkptr[7];
  2910. +// tmp6 = blkptr[1] - blkptr[6];
  2911. + psubx.h r3, r4, r1
  2912. +
  2913. +// /* Even part per LL&M figure 1 --- note that published figure is faulty;
  2914. +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  2915. +// */
  2916. +
  2917. +#define tmp10 r1
  2918. +#define tmp13 r5
  2919. +#define tmp11 r7
  2920. +#define tmp12 r3
  2921. +#define z1 r9
  2922. +
  2923. +// tmp10 = tmp0 + tmp3;
  2924. +// tmp13 = tmp0 - tmp3;
  2925. + paddsub.h r1, r2:t, r5:b
  2926. +// tmp11 = tmp1 + tmp2;
  2927. +// tmp12 = tmp1 - tmp2;
  2928. + paddsub.h r4, r2:b, r5:t
  2929. +
  2930. +
  2931. +// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
  2932. +// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
  2933. + paddsub.h r7, r1:t, r4:t
  2934. + ld.w r10, pc[const_table - .]
  2935. + plsl.h r7, r7, PASS1_BITS
  2936. +
  2937. +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
  2938. + addhh.w r8, r4:b, r1:b
  2939. + mulhh.w r8, r8:b, r10:t
  2940. +
  2941. +// dataptr[2] =
  2942. +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
  2943. +// dataptr[6] =
  2944. +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
  2945. + mulhh.w r9, r1:b, r10:b
  2946. + ld.w r10, pc[const_table - . + 4]
  2947. + add r1, r8, r9
  2948. + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
  2949. +
  2950. + mulhh.w r9, r4:b, r10:t
  2951. + add r4, r8, r9
  2952. + satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
  2953. +
  2954. +
  2955. +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  2956. +// * cK represents cos(K*pi/16).
  2957. +// * i0..i3 in the paper are tmp4..tmp7 here.
  2958. +// */
  2959. +
  2960. +#define z2 r5
  2961. +#define z3 r6
  2962. +#define z4 r7
  2963. +#define z5 r8
  2964. +
  2965. +// z4 = tmp5 + tmp7;
  2966. +// z3 = tmp4 + tmp6;
  2967. + padd.h r2, r6, r3
  2968. +// z2 = tmp5 + tmp6;
  2969. +// z1 = tmp4 + tmp7;
  2970. + paddx.h r5, r6, r3
  2971. +
  2972. + lddpc r9, pc[const_table - . + 8]
  2973. +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
  2974. + addhh.w r8, r2:t, r2:b
  2975. + mulhh.w r8, r8:b, r10:b
  2976. + lddpc r10, pc[const_table - . + 12]
  2977. +
  2978. +
  2979. +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
  2980. + mulhh.w r11, r6:b, r9:t
  2981. +
  2982. +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
  2983. + mulhh.w r6, r6:t, r9:b
  2984. +
  2985. +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
  2986. + lddpc r9, pc[const_table - . + 20]
  2987. + mulhh.w lr, r3:b, r10:t
  2988. +
  2989. +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
  2990. + mulhh.w r3, r3:t, r10:b
  2991. +
  2992. +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
  2993. + mulhh.w r10, r2:b, r9:t
  2994. +
  2995. +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
  2996. + mulhh.w r2, r2:t, r9:b
  2997. + lddpc r9, pc[const_table - . + 16]
  2998. +// z3 += z5;
  2999. +// z4 += z5;
  3000. + add r10, r8
  3001. + add r2, r8
  3002. +
  3003. +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
  3004. + mulhh.w r8, r5:b, r9:t
  3005. +
  3006. +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
  3007. + mulhh.w r5, r5:t, r9:b
  3008. +
  3009. +// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
  3010. + add r11, r8
  3011. + add r11, r10
  3012. + satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
  3013. +
  3014. +// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
  3015. + add r6, r5
  3016. +
  3017. + sthh.w blkptr[6*2], r4:b, r11:b
  3018. + add r6, r2
  3019. + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
  3020. +
  3021. +// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
  3022. + add lr, r5
  3023. + sthh.w blkptr[4*2], r7:b, r6:b
  3024. + add lr, r10
  3025. + satrnds lr >> (CONST_BITS - PASS1_BITS), 31
  3026. +
  3027. +// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
  3028. + add r3, r8
  3029. + sthh.w blkptr[2*2], r1:b, lr:b
  3030. + add r3, r2
  3031. + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
  3032. +
  3033. +
  3034. +
  3035. +// dataptr += 8; /* advance pointer to next row */
  3036. +// blkptr += 8;
  3037. + sthh.w blkptr[0], r7:t, r3:b
  3038. + sub blkptr, -16
  3039. + sub loop_ctr, 1
  3040. + brne ROW_LOOP
  3041. +
  3042. +// }
  3043. +
  3044. + /* Pass 2: process columns.
  3045. + * We remove the PASS1_BITS scaling and, unlike the IJG code, also the
  3046. + * overall factor of 8 (hence the extra shift of 3 in the descales below).
  3047. + */
  3048. +
  3049. +// dataptr = data;
  3050. + sub blkptr, 128
  3051. +
  3052. + mov loop_ctr, 4
  3053. +// for (i = 0; i < 8; i++) {
  3054. +COLUMN_LOOP:
  3055. + ld.w r1, blkptr[0]
  3056. + ld.w r2, blkptr[1*8*2]
  3057. + ld.w r3, blkptr[2*8*2]
  3058. + ld.w r4, blkptr[3*8*2]
  3059. + ld.w r5, blkptr[4*8*2]
  3060. + ld.w r6, blkptr[5*8*2]
  3061. + ld.w r7, blkptr[6*8*2]
  3062. + ld.w r8, blkptr[7*8*2]
  3063. +
  3064. +// tmp0 = blkptr[0] + blkptr[7*8];
  3065. + padds.sh r9, r1, r8
  3066. +// tmp7 = blkptr[0] - blkptr[7*8];
  3067. + psubs.sh r1, r1, r8
  3068. +// tmp1 = blkptr[1*8] + blkptr[6*8];
  3069. + padds.sh r8, r2, r7
  3070. +// tmp6 = blkptr[1*8] - blkptr[6*8];
  3071. + psubs.sh r2, r2, r7
  3072. +// tmp2 = blkptr[2*8] + blkptr[5*8];
  3073. + padds.sh r7, r3, r6
  3074. +// tmp5 = blkptr[2*8] - blkptr[5*8];
  3075. + psubs.sh r3, r3, r6
  3076. +// tmp3 = blkptr[3*8] + blkptr[4*8];
  3077. + padds.sh r6, r4, r5
  3078. +// tmp4 = blkptr[3*8] - blkptr[4*8];
  3079. + psubs.sh r4, r4, r5
  3080. +
  3081. +// /* even part per ll&m figure 1 --- note that published figure is faulty;
  3082. +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  3083. +// */
  3084. +//
  3085. +// tmp10 = tmp0 + tmp3;
  3086. + padds.sh r5, r9, r6
  3087. +// tmp13 = tmp0 - tmp3;
  3088. + psubs.sh r9, r9, r6
  3089. +// tmp11 = tmp1 + tmp2;
  3090. + padds.sh r6, r8, r7
  3091. +// tmp12 = tmp1 - tmp2;
  3092. + psubs.sh r8, r8, r7
  3093. +
  3094. +// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
  3095. +// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
  3096. +//Might get an overflow here
  3097. + padds.sh r7, r5, r6
  3098. + psubs.sh r5, r5, r6
  3099. +
  3100. + //Rounding
  3101. + mov lr, (1 << (PASS1_BITS + 2))
  3102. + orh lr, hi(1 << (16 + PASS1_BITS + 2))
  3103. + padds.sh r7, r7, lr
  3104. + padds.sh r5, r5, lr
  3105. +
  3106. + pasr.h r7, r7, PASS1_BITS + 3
  3107. + pasr.h r5, r5, PASS1_BITS + 3
  3108. + st.w r12[0], r7
  3109. + st.w r12[4*8*2], r5
  3110. +
  3111. + lddpc r10, const_table2
  3112. +
  3113. +
  3114. +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
  3115. + padds.sh r5, r8, r9
  3116. + mulhh.w r6, r5:t, r10:t
  3117. + mulhh.w r7, r5:b, r10:t
  3118. +
  3119. +// dataptr[16] =
  3120. +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
  3121. + lddpc r11, const_table2 + 4
  3122. + mulhh.w lr, r9:t, r10:b
  3123. + mulhh.w r9, r9:b, r10:b
  3124. + add lr, r6
  3125. + add r9, r7
  3126. + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
  3127. + satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
  3128. + sthh.w r12[2*8*2], lr:b, r9:b
  3129. +
  3130. +// dataptr[48] =
  3131. +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
  3132. + mulhh.w lr, r8:t, r11:t
  3133. + mulhh.w r8, r8:b, r11:t
  3134. + add lr, r6
  3135. + add r8, r7
  3136. + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
  3137. + satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
  3138. + sthh.w r12[6*8*2], lr:b, r8:b
  3139. +
  3140. +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  3141. +// * cK represents cos(K*pi/16).
  3142. +// * i0..i3 in the paper are tmp4..tmp7 here.
  3143. +// */
  3144. +//
  3145. +// z2 = tmp5 + tmp6;
  3146. +// z3 = tmp4 + tmp6;
  3147. +// z4 = tmp5 + tmp7;
  3148. + padds.sh r5, r3, r2
  3149. + padds.sh r6, r4, r2
  3150. + padds.sh r7, r3, r1
  3151. +
  3152. +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
  3153. + padds.sh r8, r6, r7
  3154. + mulhh.w r9, r8:t, r11:b
  3155. + mulhh.w r8, r8:b, r11:b
  3156. +
  3157. +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
  3158. +// z3 += z5;
  3159. + lddpc r11, const_table2 + 8
  3160. + mulhh.w r10, r6:t, r11:t
  3161. + mulhh.w r6, r6:b, r11:t
  3162. + add r10, r9
  3163. + add r6, r8
  3164. +
  3165. +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
  3166. +// z4 += z5;
  3167. + mulhh.w lr, r7:t, r11:b
  3168. + mulhh.w r7, r7:b, r11:b
  3169. + lddpc r11, const_table2 + 12
  3170. + st.w --sp,r0
  3171. + add lr, r9
  3172. + add r7, r8
  3173. +
  3174. +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
  3175. + mulhh.w r0, r2:t, r11:t
  3176. + machh.w r0, r5:t, r11:b
  3177. + mulhh.w r2, r2:b, r11:t
  3178. + machh.w r2, r5:b, r11:b
  3179. +
  3180. +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
  3181. +// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
  3182. + add r0, r10
  3183. + lddpc r11, const_table2 + 16
  3184. + add r2, r6
  3185. + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
  3186. + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
  3187. + sthh.w r12[3*8*2], r0:b, r2:b
  3188. +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
  3189. + mulhh.w r0, r3:t, r11:t
  3190. + machh.w r0, r5:t, r11:b
  3191. + mulhh.w r2, r3:b, r11:t
  3192. + machh.w r2, r5:b, r11:b
  3193. + add r0, lr
  3194. + lddpc r11, const_table2 + 20
  3195. + add r2, r7
  3196. +
  3197. +// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
  3198. + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
  3199. + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
  3200. + sthh.w r12[5*8*2], r0:b, r2:b
  3201. +
  3202. +
  3203. +// z1 = tmp4 + tmp7;
  3204. + padds.sh r2, r4, r1
  3205. +
  3206. +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
  3207. + mulhh.w r3, r4:t, r11:t
  3208. + machh.w r3, r2:t, r11:b
  3209. + mulhh.w r4, r4:b, r11:t
  3210. + machh.w r4, r2:b, r11:b
  3211. + add r3, r10
  3212. + lddpc r11, const_table2 + 24
  3213. + add r4, r6
  3214. +
  3215. +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
  3216. +// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
  3217. + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
  3218. + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
  3219. + sthh.w r12[7*8*2], r3:b, r4:b
  3220. +
  3221. +
  3222. +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
  3223. + mulhh.w r3, r1:t, r11:t
  3224. + machh.w r3, r2:t, r11:b
  3225. + mulhh.w r4, r1:b, r11:t
  3226. + machh.w r4, r2:b, r11:b
  3227. + add r3, lr
  3228. + add r4, r7
  3229. +
  3230. +// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
  3231. + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
  3232. + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
  3233. + sthh.w r12[1*8*2], r3:b, r4:b
  3234. + ld.w r0, sp++
  3235. +
  3236. +// dataptr++; /* advance pointer to next column */
  3237. + sub blkptr, -4
  3238. + sub loop_ctr, 1
  3239. + brne COLUMN_LOOP
  3240. +
  3241. +// }
  3242. +
  3243. + popm r0-r3, r4-r7, pc
  3244. +
  3245. +// /* descale */
  3246. +// for (i = 0; i < 64; i++)
  3247. +// block[i] = (short int) DESCALE(data[i], 3);
  3248. +
  3249. +
  3250. +//}
  3251. +
  3252. +
  3253. + .align 2
  3254. +const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
  3255. + .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
  3256. + .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
  3257. +
  3258. +const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
  3259. + .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
  3260. + .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
  3261. + .short FIX_1_501321110, -FIX_0_899976223
  3262. +
  3263. +
  3264. +
  3265. +
  3266. --- /dev/null
  3267. +++ b/libavcodec/avr32/h264idct.S
  3268. @@ -0,0 +1,451 @@
  3269. +/*
  3270. + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
  3271. + *
  3272. + * Redistribution and use in source and binary forms, with or without
  3273. + * modification, are permitted provided that the following conditions
  3274. + * are met:
  3275. + *
  3276. + * 1. Redistributions of source code must retain the above copyright
  3277. + * notice, this list of conditions and the following disclaimer.
  3278. + *
  3279. + * 2. Redistributions in binary form must reproduce the above
  3280. + * copyright notice, this list of conditions and the following
  3281. + * disclaimer in the documentation and/or other materials provided
  3282. + * with the distribution.
  3283. + *
  3284. + * 3. The name of ATMEL may not be used to endorse or promote products
  3285. + * derived from this software without specific prior written
  3286. + * permission.
  3287. + *
  3288. + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
  3289. + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  3290. + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  3291. + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
  3292. + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  3293. + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  3294. + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  3295. + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  3296. + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  3297. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  3298. + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  3299. + * DAMAGE.
  3300. + */
  3301. +
  3302. + .global h264_idct_add_avr32
  3303. +
  3304. + /* Macro for performing the 1-D transform on one row.
  3305. +
  3306. + The register 'w01' should contain the first two samples
  3307. + and the register 'w23' the last two samples of the row.
  3308. + The resulting row is placed back in w01 and w23 so that
  3309. + { w01, w23 } = { x0, x1, x3, x2 }.
  3310. + 'tmp' and 'tmp2' should be scratch registers. */
  3311. + .macro transform_row w01, w23, tmp, tmp2
  3312. + add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
  3313. + sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
  3314. + bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
  3315. + pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
  3316. + paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
  3317. + padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
  3318. + psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
  3319. + .endm
  3320. +
  3321. + /* Macro for performing the 1-D transform on two columns.
  3322. +
  3323. + The registers w0, w1, w2, w3 should each contain two
  3324. + packed samples from the two columns to transform.
  3325. + tmp and tmp2 are scratchpad registers.
  3326. +
  3327. + The resulting transformed columns are placed in the
  3328. + same positions as the input columns.
  3329. + */
  3330. + .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
  3331. + padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
  3332. + psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
  3333. + pasr.h \w2, \w1, 1 /* w2 = w1/2 */
  3334. + pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
  3335. + psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
  3336. + padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
  3337. + padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
  3338. + psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
  3339. + padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
  3340. + psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
  3341. + /* Scale down result. */
  3342. + pasr.h \w0, \w0, 6
  3343. + pasr.h \w1, \w1, 6
  3344. + pasr.h \w2, \w2, 6
  3345. + pasr.h \w3, \w3, 6
  3346. + .endm
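
Both macros implement one 1-D stage of the H.264 4x4 inverse transform; for one line of samples w0..w3 producing x0..x3, the scalar equivalent (the same butterfly as ff_h264_idct_add_c) is:

    const int z0 = w0 + w2;
    const int z1 = w0 - w2;
    const int z2 = (w1 >> 1) - w3;
    const int z3 = w1 + (w3 >> 1);
    x0 = z0 + z3;  x1 = z1 + z2;
    x2 = z1 - z2;  x3 = z0 - z3;

transform_row evaluates this with packed halfword arithmetic on a whole row, and transform_2columns does the same for two columns at once, adding the final >> 6 scaling.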
  3347. +
  3348. +/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
  3349. +
  3350. +h264_idct_add_avr32:
  3351. +
  3352. + stm --sp,r0-r3,r4-r7, lr
  3353. +
  3354. + /* Setup rounding factor. */
  3355. + mov r0, (1 << 5)
  3356. + lsl r0, 16
  3357. +
  3358. + /* Load block */
  3359. + ldm r11,r2-r9
  3360. + /* r9 = { w00, w01 },
  3361. + r8 = { w02, w03 },
  3362. + r7 = { w10, w11 },
  3363. + r6 = { w12, w13 },
  3364. + r5 = { w20, w21 },
  3365. + r4 = { w22, w23 },
  3366. + r3 = { w30, w31 },
  3367. + r2 = { w32, w33 } */
  3368. +
  3369. +
  3370. + /* Add the rounding factor to w00. */
  3371. + add r9, r0
  3372. +
  3373. + /* Transform rows */
  3374. + transform_row r9, r8, r0, r1
  3375. + transform_row r7, r6, r0, r1
  3376. + transform_row r5, r4, r0, r1
  3377. + transform_row r3, r2, r0, r1
  3378. +
  3379. + /* Transform columns */
  3380. + transform_2columns r9, r7, r5, r3, r0, r1
  3381. + transform_2columns r8, r6, r4, r2, r0, r1
  3382. +
  3383. + /* Load predicted pixels.*/
  3384. + ld.w lr, r12[0]
  3385. + ld.w r11, r12[r10]
  3386. +
  3387. + /* Unpack to halfwords. */
  3388. + punpckub.h r0, lr:t
  3389. + punpckub.h r1, lr:b
  3390. +
  3391. + /* Add with transformed row. */
  3392. + padd.h r0, r0, r9
  3393. + paddx.h r1, r1, r8
  3394. + /* Pack and saturate back to 8-bit pixels. */
  3395. + packsh.ub r0, r0, r1
  3396. +
  3397. + /* Unpack to halfwords. */
  3398. + punpckub.h lr, r11:t
  3399. + punpckub.h r11, r11:b
  3400. +
  3401. + /* Add with transformed row. */
  3402. + padd.h lr, lr, r7
  3403. + paddx.h r11, r11, r6
  3404. + /* Pack and saturate back to 8-bit pixels. */
  3405. + packsh.ub r1, lr, r11
  3406. +
  3407. + /* Store back to frame. */
  3408. + st.w r12[0], r0
  3409. + st.w r12[r10], r1
  3410. +
  3411. + add r12, r12, r10 << 1
  3412. +
  3413. + /* Load predicted pixels.*/
  3414. + ld.w lr, r12[0]
  3415. + ld.w r11, r12[r10]
  3416. +
  3417. + /* Unpack to halfwords. */
  3418. + punpckub.h r0, lr:t
  3419. + punpckub.h r1, lr:b
  3420. +
  3421. + /* Add with transformed row. */
  3422. + padd.h r0, r0, r5
  3423. + paddx.h r1, r1, r4
  3424. + /* Pack and saturate back to 8-bit pixels. */
  3425. + packsh.ub r0, r0, r1
  3426. +
  3427. + /* Unpack to halfwords. */
  3428. + punpckub.h lr, r11:t
  3429. + punpckub.h r11, r11:b
  3430. +
  3431. + /* Add with transformed row. */
  3432. + padd.h lr, lr, r3
  3433. + paddx.h r11, r11, r2
  3434. + /* Pack and saturate back to 8-bit pixels. */
  3435. + packsh.ub r1, lr, r11
  3436. +
  3437. + /* Store back to frame. */
  3438. + st.w r12[0], r0
  3439. + st.w r12[r10], r1
  3440. +
  3441. + ldm sp++,r0-r3,r4-r7, pc
  3442. +
  3443. +
  3444. + .global h264_idct8_add_avr32
  3445. +//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
  3446. +
  3447. +h264_idct8_add_avr32:
  3448. + stm --sp,r0-r3,r4-r7, lr
  3449. +
  3450. + /* Push dst and stride on stack */
  3451. + stm --sp,r10,r12
  3452. +
  3453. +// int i;
  3454. +// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
  3455. +// uint8_t *cm = cropTbl + MAX_NEG_CROP;
  3456. +
  3457. +// block[0] += 32;
  3458. +
  3459. +
  3460. +// for( i = 0; i < 8; i++ )
  3461. +// {
  3462. + mov lr, 4
  3463. +0:
  3464. + ld.w r7, r11[0*(8*2)]
  3465. + ld.w r6, r11[1*(8*2)]
  3466. + ld.w r5, r11[2*(8*2)]
  3467. + ld.w r4, r11[3*(8*2)]
  3468. + ld.w r3, r11[4*(8*2)]
  3469. + ld.w r2, r11[5*(8*2)]
  3470. + ld.w r1, r11[6*(8*2)]
  3471. + ld.w r0, r11[7*(8*2)]
  3472. +
  3473. +/*
  3474. +
  3475. + const int a0 = src[0][i] + src[4][i];
  3476. + const int a2 = src[0][i] - src[4][i];
  3477. + const int a4 = (src[2][i]>>1) - src[6][i];
  3478. + const int a6 = (src[6][i]>>1) + src[2][i];
  3479. +*/
  3480. + padd.h r8, r7, r3 /* r8 = a0 */
  3481. + psub.h r7, r7, r3 /* r7 = a2 */
  3482. + pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
  3483. + pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
  3484. + psub.h r3, r3, r1 /* r3 = a4 */
  3485. + padd.h r9, r9, r5 /* r9 = a6 */
  3486. +
  3487. +/*
  3488. + const int b0 = a0 + a6;
  3489. + const int b2 = a2 + a4;
  3490. + const int b4 = a2 - a4;
  3491. + const int b6 = a0 - a6;
  3492. +*/
  3493. + padd.h r1, r8, r9 /* r1 = b0 */
  3494. + psub.h r8, r8, r9 /* r8 = b6 */
  3495. + padd.h r5, r7, r3 /* r5 = b2 */
  3496. + psub.h r7, r7, r3 /* r7 = b4 */
  3497. +
  3498. +/*
  3499. + const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
  3500. + const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
  3501. + const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
  3502. + const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
  3503. +*/
  3504. + pasr.h r3, r0, 1
  3505. + padd.h r3, r3, r0
  3506. + psub.h r3, r2, r3
  3507. + psub.h r3, r3, r4 /* r3 = a1 */
  3508. +
  3509. + pasr.h r9, r4, 1
  3510. + padd.h r9, r9, r4
  3511. + psub.h r9, r0, r9
  3512. + padd.h r9, r6, r9 /* r9 = a3 */
  3513. +
  3514. + pasr.h r10, r2, 1
  3515. + padd.h r10, r10, r2
  3516. + padd.h r10, r10, r0
  3517. + psub.h r10, r10, r6 /* r10 = a5 */
  3518. +
  3519. + pasr.h r0, r6, 1
  3520. + padd.h r0, r0, r6
  3521. + padd.h r0, r0, r2
  3522. + padd.h r0, r0, r4 /* r0 = a7 */
  3523. +/*
  3524. + const int b1 = (a7>>2) + a1;
  3525. + const int b3 = a3 + (a5>>2);
  3526. + const int b5 = (a3>>2) - a5;
  3527. + const int b7 = a7 - (a1>>2);
  3528. +*/
  3529. + pasr.h r2, r0, 2
  3530. + padd.h r2, r2, r3 /* r2 = b1 */
  3531. + pasr.h r3, r3, 2
  3532. + psub.h r3, r0, r3 /* r3 = b7 */
  3533. +
  3534. + pasr.h r0, r10, 2
  3535. + padd.h r0, r0, r9 /* r0 = b3 */
  3536. + pasr.h r9, r9, 2
  3537. + psub.h r9, r9, r10 /* r9 = b5 */
  3538. +
  3539. +
  3540. +/*
  3541. + src[0][i] = b0 + b7;
  3542. + src[7][i] = b0 - b7;
  3543. + src[1][i] = b2 + b5;
  3544. + src[6][i] = b2 - b5;
  3545. + src[2][i] = b4 + b3;
  3546. + src[5][i] = b4 - b3;
  3547. + src[3][i] = b6 + b1;
  3548. + src[4][i] = b6 - b1; */
  3549. +
  3550. + padd.h r4, r1, r3
  3551. + psub.h r1, r1, r3
  3552. + st.w r11[0*(8*2)], r4
  3553. + st.w r11[7*(8*2)], r1
  3554. +
  3555. + padd.h r3, r5, r9
  3556. + psub.h r5, r5, r9
  3557. + st.w r11[1*(8*2)], r3
  3558. + st.w r11[6*(8*2)], r5
  3559. +
  3560. + padd.h r9, r7, r0
  3561. + psub.h r7, r7, r0
  3562. + st.w r11[2*(8*2)], r9
  3563. + st.w r11[5*(8*2)], r7
  3564. +
  3565. + padd.h r0, r8, r2
  3566. + psub.h r8, r8, r2
  3567. + st.w r11[3*(8*2)], r0
  3568. + st.w r11[4*(8*2)], r8
  3569. +
  3570. + sub r11, -4
  3571. + sub lr, 1
  3572. + brne 0b
  3573. +
  3574. +// }
  3575. +
  3576. + lddsp r12, sp[0] /* r12 = dst */
  3577. + sub r11, 4*4
  3578. + ldm r11++, r4-r7
  3579. + mov lr, 8
  3580. + /* Push dst and stride on stack */
  3581. +
  3582. +1:
  3583. +// for( i = 0; i < 8; i++ )
  3584. +// {
  3585. +
  3586. + /* r7 = {src[i][0], src[i][1]}
  3587. + r6 = {src[i][2], src[i][3]}
  3588. + r5 = {src[i][4], src[i][5]}
  3589. + r4 = {src[i][6], src[i][7]} */
  3590. +
  3591. +/*
  3592. + const int a0 = src[i][0] + src[i][4];
  3593. + const int a2 = src[i][0] - src[i][4];
  3594. + const int a4 = (src[i][2]>>1) - src[i][6];
  3595. + const int a6 = (src[i][6]>>1) + src[i][2];
  3596. +*/
  3597. + pasr.h r8, r6, 1
  3598. + pasr.h r9, r4, 1
  3599. + addhh.w r0, r7:t, r5:t /* r0 = a0 */
  3600. + subhh.w r1, r7:t, r5:t /* r1 = a2 */
  3601. + subhh.w r2, r8:t, r4:t /* r2 = a4 */
  3602. + addhh.w r3, r9:t, r6:t /* r3 = a6 */
  3603. +
  3604. +/*
  3605. + const int b0 = a0 + a6;
  3606. + const int b2 = a2 + a4;
  3607. + const int b4 = a2 - a4;
  3608. + const int b6 = a0 - a6;
  3609. +*/
  3610. + add r10, r0, r3 /* r10 = b0 */
  3611. + sub r0, r3 /* r0 = b6 */
  3612. + add r3, r1, r2 /* r3 = b2 */
  3613. + sub r1, r2 /* r1 = b4 */
  3614. +/*
  3615. +
  3616. +
  3617. + const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
  3618. + const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
  3619. + const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
  3620. + const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
  3621. + addhh.w r8, r8:b, r6:b
  3622. + addhh.w r2, r4:b, r7:b
  3623. + sub r2, r8 /* r2 = a3 */
  3624. +
  3625. + addhh.w r9, r9:b, r4:b
  3626. + subhh.w r8, r5:b, r6:b
  3627. + sub r8, r9 /* r8 = a1 */
  3628. +
  3629. + pasr.h r9, r7, 1
  3630. + addhh.w r9, r9:b, r7:b
  3631. + addhh.w r6, r5:b, r6:b
  3632. + add r6, r9 /* r6 = a7 */
  3633. +
  3634. + pasr.h r9, r5, 1
  3635. + addhh.w r9, r9:b, r5:b
  3636. + subhh.w r5, r4:b, r7:b
  3637. + add r5, r9 /* r5 = a5 */
  3638. +
  3639. +/* const int b1 = (a7>>2) + a1;
  3640. + const int b3 = (a5>>2) + a3;
  3641. + const int b5 = (a3>>2) - a5;
  3642. + const int b7 = -(a1>>2) + a7 ; */
  3643. + asr r4, r6, 2
  3644. + add r4, r8 /* r4 = b1 */
  3645. + asr r8, 2
  3646. + rsub r8, r6 /* r8 = b7 */
  3647. +
  3648. + asr r6, r5, 2
  3649. + add r6, r2 /* r6 = b3 */
  3650. + asr r2, 2
  3651. + sub r2, r5 /* r2 = b5 */
  3652. +
  3653. +/*
  3654. + dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
  3655. + dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
  3656. + dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
  3657. + dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
  3658. + dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
  3659. + dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
  3660. + dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
  3661. + dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
  3662. +*/
  3663. + add r5, r10, r8
  3664. + satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
  3665. + sub r10, r8
  3666. + satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
  3667. + add r8, r3, r2
  3668. + satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
  3669. + sub r3, r2
  3670. + satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
  3671. +
  3672. + add r2, r1, r6
  3673. + satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
  3674. + sub r1, r6
+ satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
+
+ add r6, r0, r4
+ satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
+ sub r0, r4
+ satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
+
+ ld.w r4, r12[0]
+
+ packw.sh r8, r5, r8
+ packw.sh r7, r2, r6
+ ld.w r9, r12[4]
+ packw.sh r6, r0, r1
+ packw.sh r5, r3, r10
+
+ punpckub.h r10, r4:t
+ punpckub.h r4, r4:b
+ punpckub.h r3, r9:t
+ punpckub.h r9, r9:b
+
+ padd.h r8, r8, r10
+ padd.h r7, r7, r4
+ padd.h r6, r6, r3
+ padd.h r5, r5, r9
+
+ lddsp r10, sp[4] /* r10 = stride */
+ packsh.ub r0, r8, r7
+ packsh.ub r1, r6, r5
+
+ st.w r12[0], r0
+ st.w r12[4], r1
+
+ ldm r11++, r4-r7
+ add r12, r10 /* dst += stride */
+
+ sub lr, 1
+ brne 1b
+
+ sub sp, -8
+ ldm sp++, r0-r3, r4-r7, pc
+
+
+
+// }
+//}
--- /dev/null
+++ b/libavcodec/avr32/idct.S
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+ .global idct_add_avr32
+ .global idct_put_avr32
+ .global idct_avr32
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define ONE ((INT32) 1)
+
+#define CONST_SCALE (ONE << CONST_BITS)
+
+#define LINE_SIZE 32
+
+#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 (12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 (16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 (16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 (25172) /* FIX(3.072711026) */
+
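+/* The constants above follow the jpeglib FIX() convention: each value is
+   the real coefficient scaled by CONST_SCALE (2^CONST_BITS) and rounded to
+   the nearest integer. A minimal sketch of the derivation (FIX() itself is
+   not defined in this file; shown for illustration only):
+
+       #define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5))
+
+   e.g. FIX(0.541196100) = (INT32)(0.541196100 * 8192 + 0.5) = 4433. */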
+
+#define loop_cnt r11
+
+ .text
+
+idct_add_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for some variables on the stack
+ sub sp, 8
+ stdsp SP[0], r12 // rfp
+ stdsp SP[4], r11 // iinc
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+FOR_ROW:
+
+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row of the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r10[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne AC_ROW //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d r10++, r4
+ st.d r10++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
+
+ bral COLUMN_TRANSFORM //Perform the column transform after the row transform is done
+
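+/* Illustrative C for the DC-only shortcut above (names are hypothetical):
+   when a row has no AC energy, its 1-D IDCT is just the DC value copied
+   across, pre-scaled for the second pass:
+
+       if ((row[1] | row[2] | ... | row[7]) == 0)
+           for (i = 0; i < 8; i++)
+               row[i] = row[0] << PASS1_BITS;
+*/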
+
+AC_ROW:
+
+
+ ld.w r12, pc[coef_table - .]
+ ld.w r9, pc[coef_table - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6], r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r12:t
+ mulhh.w r6, r0:t, r12:b
+ ld.w r12, pc[coef_table - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table - . + 12]
+ mulhh.w r4, r4:b, r12:t // r4 = z3
+ mulhh.w lr, lr:b, r12:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r12, pc[coef_table - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table - . + 20]
+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w r10[0], r4:t, r5:t
+ sthh.w r10[4], r3:t, r2:t
+ sthh.w r10[8], r2:b, r3:b
+ sthh.w r10[12], r5:b, r4:b
+
+
+
+ sub r10, -16
+ sub loop_cnt, 1
+ brne FOR_ROW, e
+
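+/* The AC path above follows the even/odd decomposition used by jpeglib's
+   jpeg_idct_islow() (Loeffler/Ligtenberg/Moshytz). For reference, a C
+   sketch of the even half as computed here, where DESCALE is the rounding
+   right-shift that satrnds implements:
+
+       z1    = (d2 + d6) * FIX_0_541196100;
+       tmp2  = DESCALE(z1 - d6 * FIX_1_847759065, CONST_BITS - PASS1_BITS);
+       tmp3  = DESCALE(z1 + d2 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
+       tmp0  = (d0 + d4) << PASS1_BITS;
+       tmp1  = (d0 - d4) << PASS1_BITS;
+       tmp10 = tmp0 + tmp3;  tmp13 = tmp0 - tmp3;
+       tmp11 = tmp1 + tmp2;  tmp12 = tmp1 - tmp2;
+
+   The odd half combines d1, d3, d5 and d7 through z1..z5 analogously. */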
+COLUMN_TRANSFORM:
+
+ sub r10, 128 //Set pointer to start of DCT block
+
+
+ mov loop_cnt, 8
+FOR_COLUMN:
+ ldins.h r3:t, r10[0] // r3:t = dataptr[0]
+ ldins.h r1:t, r10[1*8*2] // r1:t = dataptr[1]
+ ldins.h r2:t, r10[2*8*2] // r2:t = dataptr[2]
+ ldins.h r0:t, r10[5*8*2] // r0:t = dataptr[5]
+ ldins.h r3:b, r10[4*8*2] // r3:b = dataptr[4]
+ ldins.h r1:b, r10[3*8*2] // r1:b = dataptr[3]
+ ldins.h r2:b, r10[6*8*2] // r2:b = dataptr[6]
+ ldins.h r0:b, r10[7*8*2] // r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne AC_COLUMN //If there are non-zero AC coefficients, perform the column transform
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ satrnds r3 >> (PASS1_BITS + 3 + 16), 9
+ ld.d r0, r12[0]
+ sub r10, -2 // Increment the dataptr
+ bfins r3, r3, 16, 16
+ punpckub.h r2, r1:t
+ padd.h r2, r2, r3
+ punpckub.h r1, r1:b
+ padd.h r1, r1, r3
+ packsh.ub r1, r2, r1
+ punpckub.h r2, r0:t
+ padd.h r2, r2, r3
+ punpckub.h r0, r0:b
+ padd.h r0, r0, r3
+ packsh.ub r0, r2, r0
+ st.d r12[0], r0
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_COLUMN //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
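+
+/* Illustrative C for the DC-only case above: the rounded, descaled DC is
+   added with unsigned saturation to the 8 destination bytes at rfp, then
+   rfp advances by iinc (clamp_u8 is hypothetical):
+
+       int dc = DESCALE(col0, PASS1_BITS + 3);
+       for (i = 0; i < 8; i++)
+           rfp[i] = clamp_u8(rfp[i] + dc);
+
+   punpckub.h widens bytes to halfwords, padd.h adds the replicated DC,
+   and packsh.ub packs back to bytes with saturation. */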
+
+AC_COLUMN:
+
+ ld.w r12, pc[coef_table - .]
+ ld.w r9, pc[coef_table - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r12:t // r4 = z1
+ mulhh.w r5, r2:b, r12:b
+ ld.w r12, pc[coef_table - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table - . + 12]
+ mulhh.w r8, r6:b, r12:t // r8 = z3
+ mulhh.w r6, r6:t, r12:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table - . + 16]
+ add r12, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+
+ add r12, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r1, r1, r6
+ packw.sh r8, r8, r0
+ packw.sh r3, r3, r5
+ packw.sh r4, r4, r2
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ ld.d r6, r12[0]
+ sub r10, -2 // Increment the dataptr
+ punpckub.h r0, r7:t
+ padd.h r1, r1, r0
+ punpckub.h r0, r7:b
+ padd.h r8, r8, r0
+ packsh.ub r7, r1, r8
+ punpckub.h r0, r6:t
+ padd.h r3, r3, r0
+ punpckub.h r0, r6:b
+ padd.h r4, r4, r0
+ packsh.ub r6, r3, r4
+ st.d r12[0], r6
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne FOR_COLUMN //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+
+// Coefficient table:
+ .align 2
+coef_table:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, -FIX_0_899976223, -FIX_2_562915447
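+
+/* The pc-relative loads above fetch this table as six halfword pairs
+   (byte offsets on the left; roles refer to the register comments above):
+
+       0: FIX_0_541196100, -FIX_1_847759065   -> z1, tmp2
+       4: FIX_0_765366865,  FIX_1_175875602   -> tmp3, z5
+       8: -FIX_1_961570560, -FIX_0_390180644  -> z3, z4
+      12: FIX_0_298631336,  FIX_2_053119869   -> tmp0, tmp1
+      16: FIX_3_072711026,  FIX_1_501321110   -> tmp2, tmp3
+      20: -FIX_0_899976223, -FIX_2_562915447  -> z1, z2 */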
+
+
+idct_put_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for some variables on the stack
+ sub sp, 8
+ stdsp SP[0], r12 // rfp
+ stdsp SP[4], r11 // iinc
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+0:
+
+ ldm r10, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row of the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r10[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d r10++, r4
+ st.d r10++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ bral 2f //Perform the column transform after the row transform is done
+
+1:
+
+ ld.w r12, pc[coef_table_copy - .]
+ ld.w r9, pc[coef_table_copy - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6], r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r12:t
+ mulhh.w r6, r0:t, r12:b
+ ld.w r12, pc[coef_table_copy - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table_copy - . + 12]
+ mulhh.w r4, r4:b, r12:t // r4 = z3
+ mulhh.w lr, lr:b, r12:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r12, pc[coef_table_copy - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table_copy - . + 20]
+ mulhh.w r2, r2:b, r12:t // r2 = tmp2
+ mulhh.w r3, r3:b, r12:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w r10[0], r4:t, r5:t
+ sthh.w r10[4], r3:t, r2:t
+ sthh.w r10[8], r2:b, r3:b
+ sthh.w r10[12], r5:b, r4:b
+
+
+
+ sub r10, -16
+ sub loop_cnt, 1
+ brne 0b
+
+2:
+
+ sub r10, 128 //Set pointer to start of DCT block
+
+ mov loop_cnt, 8
+
+0:
+ ldins.h r3:t, r10[0] // r3:t = dataptr[0]
+ ldins.h r1:t, r10[1*8*2] // r1:t = dataptr[1]
+ ldins.h r2:t, r10[2*8*2] // r2:t = dataptr[2]
+ ldins.h r0:t, r10[5*8*2] // r0:t = dataptr[5]
+ ldins.h r3:b, r10[4*8*2] // r3:b = dataptr[4]
+ ldins.h r1:b, r10[3*8*2] // r1:b = dataptr[3]
+ ldins.h r2:b, r10[6*8*2] // r2:b = dataptr[6]
+ ldins.h r0:b, r10[7*8*2] // r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the column transform
+
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ satrnds r3 >> (PASS1_BITS + 3 + 16), 31
+ packw.sh r3, r3, r3
+ packsh.ub r3, r3, r3
+ mov r2, r3
+ st.d r12[0], r2
+ add r12, r9 // increment rfp
+ sub r10, -2 // Increment the dataptr
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+1:
+
+ ld.w r12, pc[coef_table_copy - .]
+ ld.w r9, pc[coef_table_copy - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r12:t // r4 = z1
+ mulhh.w r5, r2:b, r12:b
+ ld.w r12, pc[coef_table_copy - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table_copy - . + 12]
+ mulhh.w r8, r6:b, r12:t // r8 = z3
+ mulhh.w r6, r6:t, r12:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r12, r0:b, r9:t // r12 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table_copy - . + 16]
+ add r12, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table_copy - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+
+ add r12, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r1, r1, r6
+ packw.sh r8, r8, r0
+ packw.sh r3, r3, r5
+ packw.sh r4, r4, r2
+
+ packsh.ub r1, r1, r8
+ packsh.ub r0, r3, r4
+ lddsp r12, SP[0] // rfp
+ lddsp r9, SP[4] // iinc
+ st.d r12[0], r0
+ sub r10, -2 // Increment the dataptr
+ add r12, r9 // increment rfp
+ stdsp SP[0], r12
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -8
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+
+ .align 2
+coef_table_copy:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, -FIX_0_899976223, -FIX_2_562915447
+
+
+idct_avr32:
+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
+
+ // Give room for a temporary block on the stack
+ sub sp, 8*8*2
+
+ mov loop_cnt, 8 //Initialize loop counter
+
+0:
+
+ ldm r12++, r0, r1, r2, r3 //Load 8 DCT coefficients from the current row of the DCT block
+ mov r6, 0
+#ifdef USE_PREFETCH
+ pref r12[LINE_SIZE] //Prefetch next line
+#endif
+ or r4, r2, r3 << 16
+ or r4, r1 //Check if all DCT coefficients except the DC are zero
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the row transform
+
+ paddsub.h r5, r3:t, r6:b //Extract the DC coefficient into r5
+ plsl.h r5, r5, PASS1_BITS
+ mov r4, r5
+ st.d sp++, r4
+ st.d sp++, r4
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ bral 2f //Perform the column transform after the row transform is done
+
+1:
+
+ ld.w r10, pc[coef_table_idct - .]
+ ld.w r9, pc[coef_table_idct - . + 4]
+
+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6], r4:b = dataptr[3] + dataptr[7]
+ mulhh.w r5, r4:t, r10:t
+ mulhh.w r6, r0:t, r10:b
+ ld.w r10, pc[coef_table_idct - . + 8]
+ mulhh.w r7, r2:t, r9:t
+ add r6, r5 // tmp2
+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
+ add r7, r5 // tmp3
+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r3:t, r1:t
+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
+
+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
+
+
+
+ addhh.w lr, r3:b, r1:b // lr = z4
+ addhh.w r5, r4:b, lr:b
+ mulhh.w r5, r5:b, r9:b // r5 = z5
+
+ ld.w r9, pc[coef_table_idct - . + 12]
+ mulhh.w r4, r4:b, r10:t // r4 = z3
+ mulhh.w lr, lr:b, r10:b // lr = z4
+
+ add r4, r5
+ add lr, r5
+
+ addhh.w r5, r2:b, r1:b // r5 = z2
+ addhh.w r8, r3:b, r0:b // r8 = z1
+
+
+ mulhh.w r0, r0:b, r9:t // r0 = tmp0
+ ld.w r10, pc[coef_table_idct - . + 16]
+ mulhh.w r1, r1:b, r9:b // r1 = tmp1
+ ld.w r9, pc[coef_table_idct - . + 20]
+ mulhh.w r2, r2:b, r10:t // r2 = tmp2
+ mulhh.w r3, r3:b, r10:b // r3 = tmp3
+ mulhh.w r8, r8:b, r9:t // r8 = z1
+ mulhh.w r5, r5:b, r9:b // r5 = z2
+
+
+ add r0, r8
+ add r0, r4
+ add r1, r5
+ add r1, lr
+ add r2, r5
+ add r2, r4
+ add r3, r8
+ add r3, lr
+
+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
+
+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
+
+ sthh.w sp[0], r4:t, r5:t
+ sthh.w sp[4], r3:t, r2:t
+ sthh.w sp[8], r2:b, r3:b
+ sthh.w sp[12], r5:b, r4:b
+
+
+
+ sub sp, -16
+ sub loop_cnt, 1
+ brne 0b
+
+2:
+
+ sub sp, 8*8*2 //Set pointer to start of DCT block
+ sub r12, 8*8*2 //Set pointer to start of DCT block
+
+ mov loop_cnt, 8
+
+0:
+ ldins.h r3:t, sp[0] // r3:t = dataptr[0]
+ ldins.h r1:t, sp[1*8*2] // r1:t = dataptr[1]
+ ldins.h r2:t, sp[2*8*2] // r2:t = dataptr[2]
+ ldins.h r0:t, sp[5*8*2] // r0:t = dataptr[5]
+ ldins.h r3:b, sp[4*8*2] // r3:b = dataptr[4]
+ ldins.h r1:b, sp[3*8*2] // r1:b = dataptr[3]
+ ldins.h r2:b, sp[6*8*2] // r2:b = dataptr[6]
+ ldins.h r0:b, sp[7*8*2] // r0:b = dataptr[7]
+
+ or r4, r1, r3 << 16
+ or r4, r2
+ or r4, r0
+ brne 1f //If there are non-zero AC coefficients, perform the column transform
+
+ satrnds r3 >> (PASS1_BITS + 3 + 16), 31
+ packw.sh r3, r3, r3
+ mov r2, r3
+ st.d r12++, r2
+ st.d r12++, r2
+ sub sp, -2 // Increment the dataptr
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -(8*8*2 - 8)
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+1:
+
+ ld.w r10, pc[coef_table_idct - .]
+ ld.w r9, pc[coef_table_idct - . + 4]
+
+ addhh.w r4, r2:t, r2:b
+ mulhh.w r4, r4:b, r10:t // r4 = z1
+ mulhh.w r5, r2:b, r10:b
+ ld.w r10, pc[coef_table_idct - . + 8]
+ mulhh.w r6, r2:t, r9:t
+ add r5, r4 // r5 = tmp2
+ add r6, r4 // r6 = tmp3
+
+ addhh.w r7, r3:t, r3:b
+ subhh.w r8, r3:t, r3:b
+
+ lsl r7, CONST_BITS
+ lsl r8, CONST_BITS
+
+ add r2, r7, r6 // r2 = tmp10
+ sub r3, r7, r6 // r3 = tmp13
+ add r4, r8, r5 // r4 = tmp11
+ sub r5, r8, r5 // r5 = tmp12
+
+
+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
+ addhh.w r7, r6:t, r6:b
+ mulhh.w r7, r7:b, r9:b // r7 = z5
+
+ ld.w r9, pc[coef_table_idct - . + 12]
+ mulhh.w r8, r6:b, r10:t // r8 = z3
+ mulhh.w r6, r6:t, r10:b // r6 = z4
+
+ add r8, r7
+ add r6, r7
+
+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
+
+ mulhh.w r10, r0:b, r9:t // r10 = tmp0
+ mulhh.w r0, r0:t, r9:b // r0 = tmp1
+ ld.w r9, pc[coef_table_idct - . + 16]
+ add r10, r8
+ add r0, r6
+
+ ld.w lr, pc[coef_table_idct - . + 20]
+ machh.w r8, r1:b, r9:t // r8 = tmp2
+ machh.w r6, r1:t, r9:b // r6 = tmp3
+ mulhh.w r9, r7:b, lr:t // r9 = z1
+ mulhh.w r7, r7:t, lr:b // r7 = z2
+
+
+ add r10, r9
+ add r0, r7
+ add r8, r7
+ add r6, r9
+
+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
+ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
+ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
+
+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
+
+ packw.sh r7, r1, r6
+ packw.sh r6, r8, r0
+ packw.sh r5, r3, r5
+ packw.sh r4, r4, r2
+
+ stm r12, r4-r7
+ sub sp, -2 // Increment the dataptr
+ sub r12, -16
+
+ sub loop_cnt, 1 //Decrement loop counter
+ brne 0b //Perform loop one more time if loop_cnt is not zero
+
+ sub sp, -(8*8*2 - 8)
+ popm r0-r3, r4-r7, pc //Pop back registers and PC
+
+
+
+ .align 2
+coef_table_idct:
+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
+ .short FIX_3_072711026, FIX_1_501321110, -FIX_0_899976223, -FIX_2_562915447
+
--- /dev/null
+++ b/libavcodec/avr32/mc.S
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+
+ /* Macro for masking the lowest bit of each byte in a
+ packed word */
+ .macro packedmask1 reg, round
+ .if \round
+ and \reg, \reg, r8 >> 1
+ .else
+ and \reg, r8
+ .endif
+ .endm
+
+ /* Macro for 8 pixel wide horizontal and vertical interpolation functions */
+ .macro pixels8_hv round, put
+
+
+ pushm r0-r7, lr
+
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+
+ /* Rounding immediate */
+ .if \round
+ mov r8, lo(0x02020202)
+ orh r8, hi(0x02020202)
+ .else
+ mov r8, lo(0x01010101)
+ orh r8, hi(0x01010101)
+ .endif
+ mov r7, 2
+
+ /* Pixel naming convention:
+
+ |-----------------------------------------------------|
+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
+ |----d00---d01---d02---d03---d04---d05---d06---d07----|
+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
+ |-----------------------------------------------------|
+ */
+1:
+ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
+ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
+ mov lr, r9
+ eor r2, r0, r1
+ packedmask1 r2, \round
+ add r2, r8
+
+ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+
+ add r11, r10 // pixels += line_size
+ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
+0:
+ eor r5, r1, r3
+ packedmask1 r5, \round
+ add r2, r5
+
+ paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
+ eor r6, r0, r1
+ packedmask1 r6, \round
+ add r2, r2, r6 << 1
+
+ ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 }
+ add r11, r10 // pixels += line_size
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+
+ paddh.ub r0, r0, r1
+ plsr.b r2, r2, 2
+ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
+
+ /* Next row */
+ .if \put
+ eor r2, r3, r4
+ packedmask1 r2, \round
+ add r2, r8
+ .else
+ ld.w r6, r12[0]
+ eor r2, r3, r4
+ packedmask1 r2, \round
+ add r2, r8
+ pavg.ub r0, r0, r6
+ .endif
+ st.w r12[0], r0 // Put data into the block
+
+ add r5, r2
+ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+
+ eor r6, r0, r1
+ packedmask1 r6, \round
+ add r5, r5, r6 << 1
+
+ .if \put
+ paddh.ub r1, r0, r1
+ plsr.b r5, r5, 2
+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
+ .else
+ ld.w r3, r12[r10]
+ paddh.ub r1, r0, r1
+ plsr.b r5, r5, 2
+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
+ pavg.ub r1, r1, r3
+ .endif
+
+ st.w r12[r10], r1 // Put data into the block
+
+
+ ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 }
+ add r11, r10 // pixels += line_size
+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
+ add r12, r12, r10 << 1 // block += 2*line_size
+ sub lr, 2
+ brne 0b
+
+ mul r0, r10, r9 // r0 = line_size * h
+ rsub r0, r0, 4 // r0 = 4 - (line_size * h)
+ add r11, r0
+ sub r11, r10 // pixels += 4 - (line_size * (h+1))
+ add r12, r0 // block += 4 - (line_size * h)
+ sub r7, 1
+ brne 1b
+
+ popm r0-r7, pc
+ .endm
+
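+/* Illustrative C for the macro above: each output pixel is the 4-tap
+   average of a 2x2 source neighbourhood, with R = 2 for the rounding
+   variant and R = 1 for the no-rounding one (matching r8 above):
+
+       dst[x] = (src[x] + src[x+1]
+               + src[x+stride] + src[x+stride+1] + R) >> 2;
+
+   paddh.ub produces truncated per-byte halves, and the packedmask1 terms
+   accumulated in r2/r5 restore the low bits those halvings drop. */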
+
+ /* Macro for 8 pixel wide vertical interpolation functions */
+
+ .macro pixels8_v round, put
+ pushm r4-r7, lr
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+
+ /*
+ Pixel naming convention:
+ |-----------------------------------------------|
+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
+ |-d00---d01---d02---d03---d04---d05---d06---d07-|
+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
+ |-----------------------------------------------|
+ */
+ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
+ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
+ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
+ sub r10, 4 // stride -= 4
+ add r11, r11, r10 << 1 // src += 2*stride
+ sub r11, -4 // src += 4
+
+0:
+ .if \round
+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .else
+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .endif
+
+ .if \put
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
+ .else
+ ld.w lr, r12[0]
+ ld.w r7, r12[4]
+ pavg.ub r5, r5, lr
+ pavg.ub r4, r4, r7
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
+ .endif
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+
+ .if \round
+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .else
+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
+ .endif
+ .if \put
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
+ .else
+ ld.w r8, r12[0]
+ ld.w r6, r12[4]
+ pavg.ub r5, r5, r8
+ pavg.ub r4, r4, r6
+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
+ .endif
+
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+ sub r9, 2
+ brne 0b
+
+ popm r4-r7, pc
+ .endm
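+
+/* Illustrative C for the macro above: two-tap vertical interpolation,
+   where \round selects round-up (pavg.ub) or truncation (paddh.ub):
+
+       dst[x] = (src[x] + src[x + stride] + \round) >> 1;
+
+   The \put==0 (avg_) variants additionally average the result with the
+   bytes already in *dst via pavg.ub. */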
+
+ /* Macro for 8 pixel wide horizontal interpolation functions */
+
+ .macro pixels8_h round, put
+ pushm r4-r7, lr
+
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+ /*
+ Pixel naming convention:
+ |--------------------------------------------------------------------|
+ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
+ |------|-------|-------|-------|-------|-------|-------|-------|-----|
+ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
+ |--------------------------------------------------------------------|
+ */
+
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+
+0:
+ .if \round
+ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .else
+ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .endif
+ .if \put
+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+ .else
+ ld.w r8, r12[0]
+ ld.w r6, r12[4]
+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
+ pavg.ub lr, lr, r8
+ pavg.ub r7, r7, r6
+ .endif
+ st.w r12[0], lr // dst = { d00, d01, d02, d03 }
+ st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
+ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+
+ .if \round
+ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .else
+ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
+ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
+ .endif
+ .if \put
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ .else
+ ld.w r7, r12[0]
+ ld.w r6, r12[4]
+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
+ pavg.ub r5, r5, r7
+ pavg.ub r4, r4, r6
+ .endif
+ st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
+ st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
+ add r11, r10 // src += stride
+#ifdef USE_PREFETCH
+ pref r11[0]
+#endif
+ add r12, r10 // dst += stride
+ sub r9, 2
+ brne 0b
+
+ popm r4-r7, pc
+ .endm
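+
+/* Illustrative C for the macro above: the horizontal counterpart,
+
+       dst[x] = (src[x] + src[x + 1] + \round) >> 1;
+
+   again folding the old dst bytes back in for the \put==0 variants. */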
+
+ /* Macro for 8 pixel wide copy functions */
+ .macro pixels8 put
+ stm --sp, r3-r7, lr
+ /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */
+ mov lr, r9
+ sub r3, r10, 2 // stride2 = stride - 2
+0:
+ .if \put
+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
+ .else
+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
+ ld.d r4, r12[0]
+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
+ pavg.ub r6, r6, r4
+ pavg.ub r7, r7, r5
+ ld.d r4, r12[r10]
+ .endif
+ st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }
+ add r11, r11, r3 << 1 // src += stride2 * 2
+ .ifeq \put
+ pavg.ub r8, r8, r4
+ pavg.ub r9, r9, r5
+ .endif
+ st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 }
+ add r12, r12, r10 << 1 // dst += 2*stride
+ sub lr, 2
+ brne 0b
+ ldm sp++, r3-r7, pc
+
+ .endm
+
+ .global put_no_rnd_pixels8_hv_avr32
+ .text
+put_no_rnd_pixels8_hv_avr32:
+ pixels8_hv 0, 1
+
+ .global put_pixels8_hv_avr32
+ .text
+put_pixels8_hv_avr32:
+ pixels8_hv 1, 1
+
+ .global avg_no_rnd_pixels8_hv_avr32
+ .text
+avg_no_rnd_pixels8_hv_avr32:
+ pixels8_hv 0, 0
+
+ .global avg_pixels8_hv_avr32
+ .text
+avg_pixels8_hv_avr32:
+ pixels8_hv 1, 0
+
+ .global put_no_rnd_pixels8_v_avr32
+ .text
+put_no_rnd_pixels8_v_avr32:
+ pixels8_v 0, 1
+
+ .global put_pixels8_v_avr32
+ .text
+put_pixels8_v_avr32:
+ pixels8_v 1, 1
+
+ .global avg_no_rnd_pixels8_v_avr32
+ .text
+avg_no_rnd_pixels8_v_avr32:
+ pixels8_v 0, 0
+
+ .global avg_pixels8_v_avr32
+ .text
+avg_pixels8_v_avr32:
+ pixels8_v 1, 0
+
+ .global put_no_rnd_pixels8_h_avr32
+ .text
+put_no_rnd_pixels8_h_avr32:
+ pixels8_h 0, 1
+
+ .global put_pixels8_h_avr32
+ .text
+put_pixels8_h_avr32:
+ pixels8_h 1, 1
+
+ .global avg_no_rnd_pixels8_h_avr32
+ .text
+avg_no_rnd_pixels8_h_avr32:
+ pixels8_h 0, 0
+
+ .global avg_pixels8_h_avr32
+ .text
+avg_pixels8_h_avr32:
+ pixels8_h 1, 0
+
+ .global put_pixels8_avr32
+ .global put_no_rnd_pixels8_avr32
+ .text
+put_pixels8_avr32:
+put_no_rnd_pixels8_avr32:
+ pixels8 1
+
+ .global avg_no_rnd_pixels8_avr32
+ .global avg_pixels8_avr32
+ .text
+avg_pixels8_avr32:
+avg_no_rnd_pixels8_avr32:
+ pixels8 0
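+
+/* All entry points above follow the dsputil op_pixels_func convention,
+   matching the R12/R11/R10/R9 register comments in the macros, i.e.
+   (signature shown for reference):
+
+       void put_pixels8_avr32(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+
+   and likewise for the _h/_v/_hv and avg_/no_rnd_ variants. */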
--- /dev/null
+++ b/libavcodec/avr32/pico.h
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#ifndef __PICO_H__
+#define __PICO_H__
+
+
+
+/* Coprocessor Number */
+#define PICO_CPNO 1
+
+/* Pixel Coprocessor Register file */
+#define PICO_REGVECT_INPIX2 cr0
+#define PICO_REGVECT_INPIX1 cr1
+#define PICO_REGVECT_INPIX0 cr2
+#define PICO_REGVECT_OUTPIX2 cr3
+#define PICO_REGVECT_OUTPIX1 cr4
+#define PICO_REGVECT_OUTPIX0 cr5
+#define PICO_REGVECT_COEFF0_A cr6
+#define PICO_REGVECT_COEFF0_B cr7
+#define PICO_REGVECT_COEFF1_A cr8
+#define PICO_REGVECT_COEFF1_B cr9
+#define PICO_REGVECT_COEFF2_A cr10
+#define PICO_REGVECT_COEFF2_B cr11
+#define PICO_REGVECT_VMU0_OUT cr12
+#define PICO_REGVECT_VMU1_OUT cr13
+#define PICO_REGVECT_VMU2_OUT cr14
+#define PICO_REGVECT_CONFIG cr15
+
+#define PICO_INPIX2 0
+#define PICO_INPIX1 1
+#define PICO_INPIX0 2
+#define PICO_OUTPIX2 3
+#define PICO_OUTPIX1 4
+#define PICO_OUTPIX0 5
+#define PICO_COEFF0_A 6
+#define PICO_COEFF0_B 7
+#define PICO_COEFF1_A 8
+#define PICO_COEFF1_B 9
+#define PICO_COEFF2_A 10
+#define PICO_COEFF2_B 11
+#define PICO_VMU0_OUT 12
+#define PICO_VMU1_OUT 13
+#define PICO_VMU2_OUT 14
+#define PICO_CONFIG 15
+
+/* Config Register */
+#define PICO_COEFF_FRAC_BITS_OFFSET 0
+#define PICO_COEFF_FRAC_BITS_SIZE 4
+#define PICO_OFFSET_FRAC_BITS_OFFSET 4
+#define PICO_OFFSET_FRAC_BITS_SIZE 4
+#define PICO_INPUT_MODE_OFFSET 8
+#define PICO_INPUT_MODE_SIZE 2
+#define PICO_OUTPUT_MODE_OFFSET 10
+#define PICO_OUTPUT_MODE_SIZE 1
+
+struct pico_config_t {
+ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE;
+ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
+ unsigned int input_mode : PICO_INPUT_MODE_SIZE;
+ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
+ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
+ int vmu2_out;
+ int vmu1_out;
+ int vmu0_out;
+ short coeff2_2;
+ short coeff2_3;
+ short coeff2_0;
+ short coeff2_1;
+ short coeff1_2;
+ short coeff1_3;
+ short coeff1_0;
+ short coeff1_1;
+ short coeff0_2;
+ short coeff0_3;
+ short coeff0_0;
+ short coeff0_1;
+};
+
+
+#define PICO_COEFF_FRAC_BITS(x) (x << PICO_COEFF_FRAC_BITS_OFFSET)
+#define PICO_OFFSET_FRAC_BITS(x) (x << PICO_OFFSET_FRAC_BITS_OFFSET)
+#define PICO_INPUT_MODE(x) (x << PICO_INPUT_MODE_OFFSET)
+#define PICO_OUTPUT_MODE(x) (x << PICO_OUTPUT_MODE_OFFSET)
+
+#define GET_PICO_COEFF_FRAC_BITS(x) ((x >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
+#define GET_PICO_OFFSET_FRAC_BITS(x) ((x >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
+#define GET_PICO_INPUT_MODE(x) ((x >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
+#define GET_PICO_OUTPUT_MODE(x) ((x >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
+
+enum pico_input_mode { PICO_TRANSFORMATION_MODE,
+ PICO_HOR_FILTER_MODE,
+ PICO_VERT_FILTER_MODE };
+
+enum pico_output_mode { PICO_PACKED_MODE,
+ PICO_PLANAR_MODE };
+
+/* Bits in coefficients */
+#define PICO_COEFF_BITS 12
+
+/* Operation bits */
+#define PICO_MATRIX (0)
+#define PICO_USE_ACC (1 << 2)
+#define PICO_SINGLE_VECTOR (1 << 3)
+
+
+#define __str(x...) #x
+#define __xstr(x...) __str(x)
+
+#define PICO_PUT_W(pico_reg, x) \
+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
+#define PICO_GET_W(pico_reg) \
+ __builtin_mvcr_w(PICO_CPNO, pico_reg)
+
+#define PICO_MVCR_W(x, pico_reg) \
+ asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
+
+#define PICO_MVRC_W(pico_reg, x) \
+ asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
+
+#define PICO_PUT_D(pico_reg, x) \
+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
+#define PICO_GET_D(pico_reg) \
+ __builtin_mvcr_d(PICO_CPNO, pico_reg)
+
+#define PICO_MVCR_D(x, pico_reg) \
+ asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
+#define PICO_MVRC_D(pico_reg, x) \
+ asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
+
+#define PICO_STCM_W(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_STCM_D(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_LDCM_W(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_LDCM_D(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
+
+static inline void set_pico_config(struct pico_config_t *config){
+ PICO_LDCM_D(config,
+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
+}
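+
+/* A minimal usage sketch (hypothetical values, for illustration only):
+
+       struct pico_config_t cfg = { 0 };
+       cfg.input_mode      = PICO_HOR_FILTER_MODE;
+       cfg.output_mode     = PICO_PACKED_MODE;
+       cfg.coeff_frac_bits = PICO_COEFF_BITS - 1;
+       cfg.coeff0_0        = 1 << cfg.coeff_frac_bits;  // identity tap
+       set_pico_config(&cfg);
+*/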
+
+static inline void get_pico_config(struct pico_config_t *config){
+ PICO_STCM_D(config,
+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
+}
+
+static inline void dump_pico_config(void){
+ struct pico_config_t pico_config;
+ const char *input_mode, *output_mode;
+ get_pico_config(&pico_config);
+
+
+ av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
+ av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
+ av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
+
+ switch ( pico_config.input_mode ){
+ case PICO_TRANSFORMATION_MODE:
+ input_mode = "Transformation Mode";
+ break;
+ case PICO_HOR_FILTER_MODE:
+ input_mode = "Horizontal Filter Mode";
+ break;
+ case PICO_VERT_FILTER_MODE:
+ input_mode = "Vertical Filter Mode";
+ break;
+ default:
+ input_mode = "Unknown Mode!!";
+ break;
+ }
+ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
+
+ switch ( pico_config.output_mode ){
+ case PICO_PLANAR_MODE:
+ output_mode = "Planar Mode";
+ break;
+ case PICO_PACKED_MODE:
+ output_mode = "Packed Mode";
+ break;
+ default:
+ output_mode = "Unknown Mode!!";
+ break;
+ }
+
+ av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
+
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
+}
+
+
+
+#endif
+
--- a/libavcodec/bitstream.h
+++ b/libavcodec/bitstream.h
@@ -178,7 +178,7 @@ typedef struct RL_VLC_ELEM {
 #endif
 /* used to avoid misaligned exceptions on some archs (alpha, ...) */
-#if defined(ARCH_X86)
+#if defined(ARCH_X86) || defined(ARCH_AVR32)
 # define unaligned16(a) (*(const uint16_t*)(a))
 # define unaligned32(a) (*(const uint32_t*)(a))
 # define unaligned64(a) (*(const uint64_t*)(a))
@@ -810,6 +810,44 @@ void free_vlc(VLC *vlc);
 * if the vlc code is invalid and max_depth>1 than the number of bits removed
 * is undefined
 */
+
+#if defined(ARCH_AVR32)
+#define GET_VLC(code, name, gb, table, bits, max_depth)\
+{\
+ int n, index, nb_bits;\
+ union { VLC_TYPE vlc[2];\
+ uint32_t u32; } table_elem;\
+\
+ index= SHOW_UBITS(name, gb, bits);\
+ table_elem.u32 = unaligned32(&table[index]); \
+ code = table_elem.vlc[0];\
+ n = table_elem.vlc[1];\
+\
+ if(max_depth > 1 && n < 0 ){\
+ LAST_SKIP_BITS(name, gb, bits)\
+ UPDATE_CACHE(name, gb)\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
+ table_elem.u32 = unaligned32(&table[index]); \
+ code = table_elem.vlc[0];\
+ n = table_elem.vlc[1];\
+ if(max_depth > 2 && n < 0){\
+ LAST_SKIP_BITS(name, gb, nb_bits)\
+ UPDATE_CACHE(name, gb)\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + code;\
+ code = table[index][0];\
+ n = table[index][1];\
+ }\
+ }\
+ SKIP_BITS(name, gb, n)\
+}
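+
+/* The AVR32 variant above differs from the generic GET_VLC below only in
+   how a table entry is fetched: both 16-bit fields of a VLC_TYPE[2] entry
+   come in via one (possibly unaligned) 32-bit load,
+
+       table_elem.u32 = unaligned32(&table[index]);
+       code = table_elem.vlc[0];
+       n    = table_elem.vlc[1];
+
+   instead of two separate halfword loads from the table. */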
  5302. +
  5303. +#else
  5304. #define GET_VLC(code, name, gb, table, bits, max_depth)\
  5305. {\
  5306. int n, index, nb_bits;\
  5307. @@ -818,7 +856,7 @@ void free_vlc(VLC *vlc);
  5308. code = table[index][0];\
  5309. n = table[index][1];\
  5310. \
  5311. - if(max_depth > 1 && n < 0){\
  5312. + if(max_depth > 1 && n < 0 ){\
  5313. LAST_SKIP_BITS(name, gb, bits)\
  5314. UPDATE_CACHE(name, gb)\
  5315. \
  5316. @@ -840,7 +878,38 @@ void free_vlc(VLC *vlc);
  5317. }\
  5318. SKIP_BITS(name, gb, n)\
  5319. }
  5320. +#endif
  5321. +#if defined(ARCH_AVR32)
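+/* Same single-load trick for run/level decoding: an RL_VLC_ELEM is four
+   bytes, so one word access fills level, run and len at once. */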
+#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
+{\
+ int n, index, nb_bits;\
+ union { RL_VLC_ELEM vlc;\
+ uint32_t u32; } table_elem;\
+\
+ index= SHOW_UBITS(name, gb, bits);\
+ table_elem.u32 = unaligned32(&table[index]); \
+ level = table_elem.vlc.level;\
+ n = table_elem.vlc.len;\
+\
+ if(max_depth > 1 && n < 0 ){\
+ SKIP_BITS(name, gb, bits)\
+ if(need_update){\
+ UPDATE_CACHE(name, gb)\
+ }\
+\
+ nb_bits = -n;\
+\
+ index= SHOW_UBITS(name, gb, nb_bits) + level;\
+ table_elem.u32 = unaligned32(&table[index]); \
+ level = table_elem.vlc.level;\
+ n = table_elem.vlc.len;\
+ }\
+ run= table_elem.vlc.run;\
+ SKIP_BITS(name, gb, n)\
+}
+
+#else
#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
{\
int n, index, nb_bits;\
@@ -849,7 +918,7 @@ void free_vlc(VLC *vlc);
level = table[index].level;\
n = table[index].len;\
\
- if(max_depth > 1 && n < 0){\
+ if(max_depth > 1 && n < 0 ){\
SKIP_BITS(name, gb, bits)\
if(need_update){\
UPDATE_CACHE(name, gb)\
@@ -864,7 +933,7 @@ void free_vlc(VLC *vlc);
run= table[index].run;\
SKIP_BITS(name, gb, n)\
}
-
+#endif
/**
* parses a vlc code, faster then get_vlc()
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4155,6 +4155,7 @@ void dsputil_init(DSPContext* c, AVCodec
if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
+ if (ENABLE_AVR32) dsputil_init_avr32 (c, avctx);
if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
if (ENABLE_VIS) dsputil_init_vis (c, avctx);
if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -2043,7 +2043,12 @@ static void free_tables(H264Context *h){
static void init_dequant8_coeff_table(H264Context *h){
int i,q,x;
+#ifdef ARCH_AVR32
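+ /* AVR32: the optimized IDCT uses the same (untransposed) coefficient
+ layout as the C code, so never transpose the dequant tables. */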
+ const int transpose = 0;
+#else
const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
+#endif
+
h->dequant8_coeff[0] = h->dequant8_buffer[0];
h->dequant8_coeff[1] = h->dequant8_buffer[1];
@@ -2066,7 +2071,13 @@ static void init_dequant8_coeff_table(H2
static void init_dequant4_coeff_table(H264Context *h){
int i,j,q,x;
+ // Yes this is ugly as hell....
+#ifdef ARCH_AVR32
+ const int transpose = 0;
+#else
const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
+#endif
+
for(i=0; i<6; i++ ){
h->dequant4_coeff[i] = h->dequant4_buffer[i];
for(j=0; j<i; j++){
@@ -3710,7 +3721,11 @@ static int init_poc(H264Context *h){
static void init_scan_tables(H264Context *h){
MpegEncContext * const s = &h->s;
int i;
+#ifdef ARCH_AVR32
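+ /* AVR32: always install the standard (C) scan tables. */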
+ if(1){
+#else
if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
+#endif
memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
}else{
--- a/libavutil/common.h
+++ b/libavutil/common.h
@@ -174,23 +174,39 @@ static inline int mid_pred(int a, int b,
* @param amax maximum value of the clip range
* @return clipped value
*/
+#if defined(ARCH_AVR32)
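+/* AVR32: the min/max instructions clip in two opcodes, without branches. */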
+#define av_clip(a, amin, amax) \
+ ({ int __tmp__; \
+ asm ("min\t%0, %1, %2\n" \
+ "max\t%0, %0, %3\n" \
+ : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \
+ __tmp__; })
+#else
static inline int av_clip(int a, int amin, int amax)
{
if (a < amin) return amin;
else if (a > amax) return amax;
else return a;
}
+#endif
/**
* clip a signed integer value into the 0-255 range
* @param a value to clip
* @return clipped value
*/
+#if defined(ARCH_AVR32)
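+/* AVR32: satu saturates to the unsigned 8-bit range in one instruction. */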
+#define av_clip_uint8(a) \
+ ({ int __tmp__ = a; \
+ asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \
+ __tmp__; })
+#else
static inline uint8_t av_clip_uint8(int a)
{
if (a&(~255)) return (-a)>>31;
else return a;
}
+#endif
/**
* clip a signed integer value into the -32768,32767 range
--- a/libfaad2/common.h
+++ b/libfaad2/common.h
@@ -69,7 +69,7 @@ extern "C" {
/* Use if target platform has address generators with autoincrement */
//#define PREFER_POINTERS
-#if defined(_WIN32_WCE) || defined(__arm__)
+#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
#define FIXED_POINT
#endif
--- a/libmpcodecs/ad_libmad.c
+++ b/libmpcodecs/ad_libmad.c
@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
sh->samplerate=this->frame.header.samplerate;
sh->i_bps=this->frame.header.bitrate/8;
+#ifdef WORDS_BIGENDIAN
+ sh->sample_format = AF_FORMAT_S16_BE;
+#else
+ sh->sample_format = AF_FORMAT_S16_LE;
+#endif
sh->samplesize=2;
return 1;
--- /dev/null
+++ b/libswscale/pico-avr32.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#ifndef __PICO_H__
+#define __PICO_H__
+
+/* Coprocessor Number */
+#define PICO_CPNO 1
+
+/* Pixel Coprocessor Register file */
+#define PICO_REGVECT_INPIX2 cr0
+#define PICO_REGVECT_INPIX1 cr1
+#define PICO_REGVECT_INPIX0 cr2
+#define PICO_REGVECT_OUTPIX2 cr3
+#define PICO_REGVECT_OUTPIX1 cr4
+#define PICO_REGVECT_OUTPIX0 cr5
+#define PICO_REGVECT_COEFF0_A cr6
+#define PICO_REGVECT_COEFF0_B cr7
+#define PICO_REGVECT_COEFF1_A cr8
+#define PICO_REGVECT_COEFF1_B cr9
+#define PICO_REGVECT_COEFF2_A cr10
+#define PICO_REGVECT_COEFF2_B cr11
+#define PICO_REGVECT_VMU0_OUT cr12
+#define PICO_REGVECT_VMU1_OUT cr13
+#define PICO_REGVECT_VMU2_OUT cr14
+#define PICO_REGVECT_CONFIG cr15
+
+#define PICO_INPIX2 0
+#define PICO_INPIX1 1
+#define PICO_INPIX0 2
+#define PICO_OUTPIX2 3
+#define PICO_OUTPIX1 4
+#define PICO_OUTPIX0 5
+#define PICO_COEFF0_A 6
+#define PICO_COEFF0_B 7
+#define PICO_COEFF1_A 8
+#define PICO_COEFF1_B 9
+#define PICO_COEFF2_A 10
+#define PICO_COEFF2_B 11
+#define PICO_VMU0_OUT 12
+#define PICO_VMU1_OUT 13
+#define PICO_VMU2_OUT 14
+#define PICO_CONFIG 15
+
+/* Config Register */
+#define PICO_COEFF_FRAC_BITS 0
+#define PICO_COEFF_FRAC_BITS_WIDTH 4
+#define PICO_OFFSET_FRAC_BITS 4
+#define PICO_OFFSET_FRAC_BITS_WIDTH 4
+#define PICO_INPUT_MODE 8
+#define PICO_INPUT_MODE_WIDTH 2
+#define PICO_OUTPUT_MODE 10
+
+#define PICO_TRANSFORMATION_MODE 0
+#define PICO_HOR_FILTER_MODE 1
+#define PICO_VERT_FILTER_MODE 2
+
+#define PICO_PLANAR_MODE 1
+#define PICO_PACKED_MODE 0
+
+/* Bits in coefficients */
+#define PICO_COEFF_BITS 12
+
+/* Operation bits */
+#define PICO_USE_ACC (1 << 2)
+#define PICO_SINGLE_VECTOR (1 << 3)
+
+
+#define __str(x...) #x
+#define __xstr(x...) __str(x)
+
+#define PICO_PUT_W(pico_reg, x) \
+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
+#define PICO_GET_W(pico_reg) \
+ __builtin_mvcr_w(PICO_CPNO, pico_reg)
+
+#define PICO_PUT_D(pico_reg, x) \
+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
+#define PICO_GET_D(pico_reg) \
+ __builtin_mvcr_d(PICO_CPNO, pico_reg)
+
+
+#define PICO_STCM_W(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_STCM_D(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_STCM_W_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_STCM_D_DEC(ptr, pico_regs...) \
+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
+
+#define PICO_LDCM_W(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+#define PICO_LDCM_D(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
+
+#define PICO_LDCM_W_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+#define PICO_LDCM_D_INC(ptr, pico_regs...) \
+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
+
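+/* Issue one PICO operation: addr0..addr2 select the input registers,
+   while op | dst_addr encodes the operation bits and output register. */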
+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
+
+
+#endif
+
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -181,7 +181,7 @@ typedef struct SwsContext{
SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
-char *sws_format_name(int format);
+char *sws_format_name(enum PixelFormat format);
//FIXME replace this with something faster
#define isPlanarYUV(x) ( \
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -47,6 +47,10 @@
#include "yuv2rgb_mlib.c"
#endif
+#ifdef ARCH_AVR32
+#include "yuv2rgb_avr32.c"
+#endif
+
#define DITHER1XBPP // only for mmx
const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
@@ -646,6 +650,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext
if (t) return t;
}
#endif
+#ifdef ARCH_AVR32
+ {
+ SwsFunc t= yuv2rgb_init_avr32(c);
+ if(t) return t;
+ }
+#endif
#ifdef HAVE_ALTIVEC
if (c->flags & SWS_CPU_CAPS_ALTIVEC)
{
@@ -736,6 +746,10 @@ int yuv2rgb_c_init_tables (SwsContext *c
//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
oy -= 256*brightness;
+#ifdef ARCH_AVR32
+ yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
+#endif
+
for (i = 0; i < 1024; i++) {
int j;
--- /dev/null
+++ b/libswscale/yuv2rgb_avr32.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2007 Atmel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. The name of ATMEL may not be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+#include "pico-avr32.h"
+#include "log.h"
+
+#define RGB(uv_part) \
+ __asm__ volatile ( \
+ "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \
+ "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \
+ "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \
+ "add\t%1, %0\n\t" /* g += tmp */\
+ "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \
+ : "=&r" (r), "=&r" (g), "=&r" (b) \
+ : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
+ "r" (&c->table_rV[0]), "r" (V), "r" (U));
+
+#undef YUV2RGB1
+#define YUV2RGB1(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+#undef YUV2RGB2
+#define YUV2RGB2(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+
+#undef YUV2BGR1
+#define YUV2BGR1(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+#undef YUV2BGR2
+#define YUV2BGR2(dst, src, y, idx) \
+ { int tmp2; __asm__ volatile ( \
+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
+ "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
+
+int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], int dstStride[]){
+ int y;
+
+ if(c->srcFormat == PIX_FMT_YUV422P){
+ srcStride[1] *= 2;
+ srcStride[2] *= 2;
+ }
+
+
+ for(y=0; y<srcSliceH; y+=2){
+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+ uint32_t *r, *g, *b;
+ uint8_t *py_1= src[0] + y*srcStride[0];
+ uint8_t *py_2= py_1 + srcStride[0];
+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
+ unsigned int h_size= c->dstW>>3;
+ while (h_size--) {
+ uint32_t U, V, Y1, Y2, tmp;
+ U = ((uint32_t*)pu)[0];
+ V = ((uint32_t*)pv)[0];
+
+ RGB("t")
+ YUV2BGR1(dst_1, py_1, Y1, 0)
+ YUV2BGR1(dst_2, py_2, Y2, 0)
+
+ RGB("u")
+ YUV2BGR2(dst_1, py_1, Y1, 1)
+ YUV2BGR2(dst_2, py_2, Y2, 1)
+
+ RGB("l")
+ YUV2BGR1(dst_1, py_1, Y1, 2)
+ YUV2BGR1(dst_2, py_2, Y2, 2)
+
+ RGB("b")
+ YUV2BGR2(dst_1, py_1, Y1, 3)
+ YUV2BGR2(dst_2, py_2, Y2, 3)
+
+ pu += 4;
+ pv += 4;
+ py_1 += 8;
+ py_2 += 8;
+ dst_1 += 24;
+ dst_2 += 24;
+ }
+ }
+ return srcSliceH;
+}
+
+
+
+static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], int dstStride[]){
+ int y;
+
+ if(c->srcFormat == PIX_FMT_YUV422P){
+ srcStride[1] *= 2;
+ srcStride[2] *= 2;
+ }
+ for(y=0; y<srcSliceH; y+=2){
+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+ uint8_t *r, *g, *b;
+ uint8_t *py_1= src[0] + y*srcStride[0];
+ uint8_t *py_2= py_1 + srcStride[0];
+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
+ unsigned int h_size= c->dstW>>3;
+ while (h_size--) {
+ uint32_t U, V, Y1, Y2, tmp;
+ U = ((uint32_t*)pu)[0];
+ V = ((uint32_t*)pv)[0];
+
+ RGB("t")
+ YUV2RGB1(dst_1, py_1, Y1, 0)
+ YUV2RGB1(dst_2, py_2, Y2, 0)
+
+ RGB("u")
+ YUV2RGB2(dst_1, py_1, Y1, 1)
+ YUV2RGB2(dst_2, py_2, Y2, 1)
+
+ RGB("l")
+ YUV2RGB1(dst_1, py_1, Y1, 2)
+ YUV2RGB1(dst_2, py_2, Y2, 2)
+
+ RGB("b")
+ YUV2RGB2(dst_1, py_1, Y1, 3)
+ YUV2RGB2(dst_2, py_2, Y2, 3)
+
+ pu += 4;
+ pv += 4;
+ py_1 += 8;
+ py_2 += 8;
+ dst_1 += 24;
+ dst_2 += 24;
+ }
+ }
+ return srcSliceH;
+}
+
+#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits)
+#define COEFF_FRAC_BITS 9
+#define OFFSET_FRAC_BITS 2
+
+/* Coefficients used in the pico */
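+/* Field order matches the register list handed to PICO_LDCM_D below. */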
+static struct {
+ short coeff2_2;
+ short coeff2_3;
+ short coeff2_0;
+ short coeff2_1;
+ short coeff1_2;
+ short coeff1_3;
+ short coeff1_0;
+ short coeff1_1;
+ short coeff0_2;
+ short coeff0_3;
+ short coeff0_0;
+ short coeff0_1;
+} pico_coeff;
+
+
+static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], int dstStride[]){
+ int y;
+ static int first_time = 1;
+
+ /* Initialize pico */
+ PICO_LDCM_D(&pico_coeff,
+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
+
+ PICO_PUT_W(PICO_CONFIG,
+ (PICO_PACKED_MODE << PICO_OUTPUT_MODE
+ | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
+ | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
+ | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
+
+
+ if(c->srcFormat == PIX_FMT_YUV422P){
+ srcStride[1] *= 2;
+ srcStride[2] *= 2;
+ }
+
+ for(y=0; y<srcSliceH; y+=2){
+ uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
+ uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
+ uint8_t *r, *g, *b;
+ uint8_t *py_1= src[0] + y*srcStride[0];
+ uint8_t *py_2= py_1 + srcStride[0];
+ uint8_t *pu= src[1] + (y>>1)*srcStride[1];
+ uint8_t *pv= src[2] + (y>>1)*srcStride[2];
+ unsigned int h_size= c->dstW>>3;
+ int *py_1_int = (int *)py_1;
+ int *py_2_int = (int *)py_2;
+ int *pu_int = (int *)pu;
+ int *pv_int = (int *)pv;
+ while (h_size--) {
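+ /* Each iteration converts 8 pixels on two lines: every block of four
+ PICO_OPs yields four packed pixels (12 bytes) per PICO_STCM_W store. */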
+ PICO_PUT_W(PICO_INPIX0, *py_1_int++);
+ PICO_PUT_W(PICO_INPIX1, *pu_int++);
+ PICO_PUT_W(PICO_INPIX2, *pv_int++);
+ PICO_OP(0, 0, 0, 4, 8);
+ PICO_OP(0, 1, 1, 4, 8);
+ PICO_OP(0, 2, 2, 5, 9);
+ PICO_OP(0, 3, 3, 5, 9);
+ PICO_PUT_W(PICO_INPIX0, *py_1_int++);
+ PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+ PICO_OP(0, 0, 0, 6, 10);
+ PICO_OP(0, 1, 1, 6, 10);
+ PICO_OP(0, 2, 2, 7, 11);
+ PICO_OP(0, 3, 3, 7, 11);
+ PICO_PUT_W(PICO_INPIX0, *py_2_int++);
+ PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+
+ PICO_OP(0, 0, 0, 4, 8);
+ PICO_OP(0, 1, 1, 4, 8);
+ PICO_OP(0, 2, 2, 5, 9);
+ PICO_OP(0, 3, 3, 5, 9);
+ PICO_PUT_W(PICO_INPIX0, *py_2_int++);
+ PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+ PICO_OP(0, 0, 0, 6, 10);
+ PICO_OP(0, 1, 1, 6, 10);
+ PICO_OP(0, 2, 2, 7, 11);
+ PICO_OP(0, 3, 3, 7, 11);
+ PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
+
+ dst_1 += 24;
+ dst_2 += 24;
+ }
+ }
+ return srcSliceH;
+}
+
+extern int avr32_use_pico;
+
+SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
+ switch(c->dstFormat){
+ case PIX_FMT_BGR24:
+ {
+ if ( avr32_use_pico ){
+ av_log(c, AV_LOG_INFO, "AVR32 BGR24: Using PICO for color space conversion\n");
+ return yuv2bgr24_avr32_pico;
+ } else {
+ av_log(c, AV_LOG_INFO, "AVR32 BGR24: Using optimized color space conversion\n");
+ return yuv2bgr24_avr32;
+ }
+ }
+ break;
+ case PIX_FMT_RGB24:
+ {
+ if ( avr32_use_pico ){
+ av_log(c, AV_LOG_INFO, "AVR32 RGB24: Using PICO for color space conversion\n");
+ return yuv2bgr24_avr32_pico;
+ } else {
+ av_log(c, AV_LOG_INFO, "AVR32 RGB24: Using optimized color space conversion\n");
+ return yuv2rgb24_avr32;
+ }
+ }
+ }
+ return NULL;
+}
+
+
+int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
+ const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
+
+ int64_t crv = inv_table[0];
+ int64_t cbu = inv_table[1];
+ int64_t cgu = -inv_table[2];
+ int64_t cgv = -inv_table[3];
+ int64_t cy = 1<<16;
+ int64_t oy = 0;
+
+ if(!fullRange){
+ cy= (cy*255) / 219;
+ oy= 16<<16;
+ }
+
+ cy = (cy *contrast )>>16;
+ crv= (crv*contrast * saturation)>>32;
+ cbu= (cbu*contrast * saturation)>>32;
+ cgu= (cgu*contrast * saturation)>>32;
+ cgv= (cgv*contrast * saturation)>>32;
+
+ oy -= 256*brightness;
+
+ pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */
+ pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
+ pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */
+ pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
+
+ if ( isRgb ){
+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
+ pico_coeff.coeff0_1 = 0; /* R <- U */
+ pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
+ pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
+
+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
+ pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
+ pico_coeff.coeff2_2 = 0; /* B <- V */
+ pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
+ } else {
+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
+ pico_coeff.coeff2_1 = 0; /* R <- U */
+ pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
+ pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
+
+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
+ pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
+ pico_coeff.coeff0_2 = 0; /* B <- V */
+ pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
+ }
+
+ return 0;
+}
+
+
+#undef RGB
--- a/libvo/vo_fbdev2.c
+++ b/libvo/vo_fbdev2.c
@@ -22,6 +22,9 @@
#include "sub.h"
#include "mp_msg.h"
+/* Draw directly to framebuffer */
+#define USE_CONVERT2FB
+
static vo_info_t info = {
"Framebuffer Device",
"fbdev2",
@@ -178,6 +181,15 @@ static int fb_preinit(int reset)
}
fb_orig_vinfo = fb_vinfo;
+ /* Reset panning offset */
+ fb_vinfo.yoffset = 0;
+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
+ mp_msg(MSGT_VO, MSGL_ERR,
+ "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
+ strerror(errno));
+ return 0;
+ }
+
fb_bpp = fb_vinfo.bits_per_pixel;
/* 16 and 15 bpp is reported as 16 bpp */
@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32
mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
return 1;
}
+#else
+ if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
+ && fb_vinfo.yoffset == 0)
+ center += fb_line_len * fb_vinfo.yres;
#endif
if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
@@ -299,14 +315,22 @@ static int query_format(uint32_t format)
{
// open the device, etc.
if (fb_preinit(0)) return 0;
- if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
+ if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
int fb_target_bpp = format & 0xff;
set_bpp(&fb_vinfo, fb_target_bpp);
fb_vinfo.xres_virtual = fb_vinfo.xres;
- fb_vinfo.yres_virtual = fb_vinfo.yres;
+ fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
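+ /* Ask for two pages of virtual height so flip_page() can pan between them. */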
if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
- return 0;
+ mp_msg(MSGT_VO, MSGL_WARN,
+ "[fbdev2] Can't double virtual y resolution: %s\n",
+ strerror(errno));
+ fb_vinfo.yres_virtual = fb_vinfo.yres;
+ if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
+ mp_msg(MSGT_VO, MSGL_ERR,
+ "[fbdev2] Can't put VSCREENINFO: %s\n",
+ strerror(errno));
+ return -1;
+ }
}
fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
@@ -367,16 +391,67 @@ static void check_events(void)
static void flip_page(void)
{
-#ifndef USE_CONVERT2FB
int i, out_offset = 0, in_offset = 0;
- for (i = 0; i < in_height; i++) {
- fast_memcpy(center + out_offset, next_frame + in_offset,
- in_width * fb_pixel_size);
- out_offset += fb_line_len;
- in_offset += in_width * fb_pixel_size;
- }
+#ifndef USE_CONVERT2FB
+ if (1) {
+#else
+ if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
#endif
+ for (i = 0; i < in_height; i++) {
+ fast_memcpy(center + out_offset, next_frame + in_offset,
+ in_width * fb_pixel_size);
+ out_offset += fb_line_len;
+ in_offset += in_width * fb_pixel_size;
+ }
+ } else {
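+ /* Page flip: toggle yoffset between the two halves and point
+ * center at the now-hidden page for the next frame. */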
+ if (fb_vinfo.yoffset == 0) {
+ fb_vinfo.yoffset += fb_vinfo.yres;
+ center -= fb_line_len * fb_vinfo.yres;
+ } else {
+ fb_vinfo.yoffset = 0;
+ center += fb_line_len * fb_vinfo.yres;
+ }
+
+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
+ mp_msg(MSGT_VO, MSGL_ERR,
+ "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
+ strerror(errno));
+ }
+ }
+}
+
+static uint32_t get_image(mp_image_t *mpi)
+{
+ if(mpi->flags&MP_IMGFLAG_READABLE)
+ return VO_FALSE; // slow video ram
+ if(mpi->type==MP_IMGTYPE_STATIC)
+ return VO_FALSE; // it is not static
+
+ if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
+ // we're lucky or codec accepts stride => ok, let's go!
+
+ //YUY2 and RGB formats
+ mpi->planes[0] = center;
+ mpi->width = in_width;
+ mpi->stride[0] = fb_line_len;
+
+ // center image
+
+ mpi->flags |= MP_IMGFLAG_DIRECT;
+
+ return VO_TRUE;
+ }
+
+ return VO_FALSE;
+}
+
+static uint32_t put_image(mp_image_t *mpi)
+{
+ // already out?
+ if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK)))
+ return VO_TRUE;
+ return VO_FALSE;
+}
static void uninit(void)
@@ -403,6 +478,10 @@ static int control(uint32_t request, voi
switch (request) {
case VOCTRL_QUERY_FORMAT:
return query_format(*((uint32_t*)data));
+ case VOCTRL_GET_IMAGE:
+ return get_image(data);
+ case VOCTRL_DRAW_IMAGE:
+ return put_image(data);
}
return VO_NOTIMPL;
}
--- a/version.sh
+++ b/version.sh
@@ -1,3 +1,3 @@
#!/bin/sh
-echo "#define VERSION \"1.0rc2-$1\"" > version.h
-echo "#define MP_TITLE \"MPlayer 1.0rc2-$1 (C) 2000-2007 MPlayer Team\"" >> version.h
+echo "#define VERSION \"1.0rc2.atmel.1-$1\"" > version.h
+echo "#define MP_TITLE \"MPlayer 1.0rc2.atmel.1-$1 (C) 2000-2007 MPlayer Team\"" >> version.h